diff options
Diffstat (limited to 'fs')
651 files changed, 39615 insertions, 27783 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..9ff073f4090a 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,6 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/aio.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -147,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) * @offset: offset in the page */ -static void v9fs_invalidate_page(struct page *page, unsigned long offset) +static void v9fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { /* * If called with zero offset, we should release * the private state assocated with the page */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) v9fs_fscache_invalidate_page(page); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index be1e34adc3c6..4d0c2e0be7e5 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) } /** - * v9fs_dir_readdir - read a directory - * @filp: opened file structure - * @dirent: directory structure ??? - * @filldir: function to populate directory structure ??? + * v9fs_dir_readdir - iterate through a directory + * @file: opened file structure + * @ctx: actor we feed the entries to * */ -static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) { - int over; + bool over; struct p9_wstat st; int err = 0; struct p9_fid *fid; @@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int reclen = 0; struct p9_rdir *rdir; - p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - fid = filp->private_data; + p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); + fid = file->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; - rdir = v9fs_alloc_rdir_buf(filp, buflen); + rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; while (1) { if (rdir->tail == rdir->head) { - err = v9fs_file_readn(filp, rdir->buf, NULL, - buflen, filp->f_pos); + err = v9fs_file_readn(file, rdir->buf, NULL, + buflen, ctx->pos); if (err <= 0) return err; @@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } reclen = st.size+2; - over = filldir(dirent, st.name, strlen(st.name), - filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); - + over = !dir_emit(ctx, st.name, strlen(st.name), + v9fs_qid2ino(&st.qid), dt_type(&st)); p9stat_free(&st); - if (over) return 0; rdir->head += reclen; - filp->f_pos += reclen; + ctx->pos += reclen; } } } /** - * v9fs_dir_readdir_dotl - read a directory - * @filp: opened file structure - * @dirent: buffer to fill dirent structures - * @filldir: function to populate dirent structures + * v9fs_dir_readdir_dotl - iterate through a directory + * @file: opened file structure + * @ctx: actor we feed the entries to * */ -static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, - filldir_t filldir) +static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) { - int over; int err = 0; struct p9_fid *fid; int buflen; struct p9_rdir *rdir; struct p9_dirent curdirent; - u64 oldoffset = 0; - p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - fid = filp->private_data; + p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); + fid = file->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; - rdir = v9fs_alloc_rdir_buf(filp, buflen); + rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + ctx->pos); if (err <= 0) return err; @@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, return -EIO; } - /* d_off in dirent structure tracks the offset into - * the next dirent in the dir. However, filldir() - * expects offset into the current dirent. Hence - * while calling filldir send the offset from the - * previous dirent structure. - */ - over = filldir(dirent, curdirent.d_name, - strlen(curdirent.d_name), - oldoffset, v9fs_qid2ino(&curdirent.qid), - curdirent.d_type); - oldoffset = curdirent.d_off; - - if (over) + if (!dir_emit(ctx, curdirent.d_name, + strlen(curdirent.d_name), + v9fs_qid2ino(&curdirent.qid), + curdirent.d_type)) return 0; - filp->f_pos = curdirent.d_off; + ctx->pos = curdirent.d_off; rdir->head += err; } } @@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir, + .iterate = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, }; @@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = { const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir_dotl, + .iterate = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, .fsync = v9fs_file_fsync_dotl, diff --git a/fs/Kconfig b/fs/Kconfig index 780725a463b1..c229f828eb01 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -211,6 +211,7 @@ source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" source "fs/f2fs/Kconfig" +source "fs/efivarfs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 0efd1524b977..370b24cee4d8 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -65,6 +65,20 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. +config BINFMT_SCRIPT + tristate "Kernel support for scripts starting with #!" + default y + help + Say Y here if you want to execute interpreted scripts starting with + #! followed by the path to an interpreter. + + You can build this support as a module; however, until that module + gets loaded, you cannot run scripts. Thus, if you want to load this + module from an initramfs, the portion of the initramfs before loading + this module must consist of compiled binaries only. + + Most systems will not boot if you say M or N here. If unsure, say Y. + config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/Makefile b/fs/Makefile index 9d53192236fc..4fe6df3ec28f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,10 +7,10 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + ioctl.o readdir.o select.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) @@ -34,10 +34,7 @@ obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o - -# binfmt_script is always there -obj-y += binfmt_script.o - +obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o @@ -49,6 +46,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o @@ -127,3 +125,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ +obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 9cf874ce8336..0d138c0de293 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -17,47 +17,43 @@ static DEFINE_RWLOCK(adfs_dir_lock); static int -adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +adfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct object_info obj; struct adfs_dir dir; int ret = 0; - if (filp->f_pos >> 32) - goto out; + if (ctx->pos >> 32) + return 0; ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); if (ret) - goto out; + return ret; - switch ((unsigned long)filp->f_pos) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) goto free_out; - filp->f_pos += 1; - - case 1: - if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0) + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) goto free_out; - filp->f_pos += 1; - - default: - break; + ctx->pos = 2; } read_lock(&adfs_dir_lock); - ret = ops->setpos(&dir, filp->f_pos - 2); + ret = ops->setpos(&dir, ctx->pos - 2); if (ret) goto unlock_out; while (ops->getnext(&dir, &obj) == 0) { - if (filldir(dirent, obj.name, obj.name_len, - filp->f_pos, obj.file_id, DT_UNKNOWN) < 0) - goto unlock_out; - filp->f_pos += 1; + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.file_id, DT_UNKNOWN)) + break; + ctx->pos++; } unlock_out: @@ -65,8 +61,6 @@ unlock_out: free_out: ops->free(&dir); - -out: return ret; } @@ -192,13 +186,12 @@ out: const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = adfs_readdir, + .iterate = adfs_readdir, .fsync = generic_file_fsync, }; static int -adfs_hash(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr) +adfs_hash(const struct dentry *parent, struct qstr *qstr) { const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; const unsigned char *name; @@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode, * requirements of the underlying filesystem. */ static int -adfs_compare(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +adfs_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index fd11a6d608ee..f1eba8c3644e 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -15,12 +15,12 @@ #include "affs.h" -static int affs_readdir(struct file *, void *, filldir_t); +static int affs_readdir(struct file *, struct dir_context *); const struct file_operations affs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = affs_readdir, + .iterate = affs_readdir, .fsync = affs_file_fsync, }; @@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = { }; static int -affs_readdir(struct file *filp, void *dirent, filldir_t filldir) +affs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - struct buffer_head *dir_bh; - struct buffer_head *fh_bh; + struct buffer_head *dir_bh = NULL; + struct buffer_head *fh_bh = NULL; unsigned char *name; int namelen; u32 i; int hash_pos; int chain_pos; - u32 f_pos; u32 ino; - int stored; - int res; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos); + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); - stored = 0; - res = -EIO; - dir_bh = NULL; - fh_bh = NULL; - f_pos = filp->f_pos; - - if (f_pos == 0) { - filp->private_data = (void *)0; - if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0) + if (ctx->pos < 2) { + file->private_data = (void *)0; + if (!dir_emit_dots(file, ctx)) return 0; - filp->f_pos = f_pos = 1; - stored++; - } - if (f_pos == 1) { - if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0) - return stored; - filp->f_pos = f_pos = 2; - stored++; } affs_lock_dir(inode); - chain_pos = (f_pos - 2) & 0xffff; - hash_pos = (f_pos - 2) >> 16; + chain_pos = (ctx->pos - 2) & 0xffff; + hash_pos = (ctx->pos - 2) >> 16; if (chain_pos == 0xffff) { affs_warning(sb, "readdir", "More than 65535 entries in chain"); chain_pos = 0; hash_pos++; - filp->f_pos = ((hash_pos << 16) | chain_pos) + 2; + ctx->pos = ((hash_pos << 16) | chain_pos) + 2; } dir_bh = affs_bread(sb, inode->i_ino); if (!dir_bh) @@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. */ - ino = (u32)(long)filp->private_data; - if (ino && filp->f_version == inode->i_version) { + ino = (u32)(long)file->private_data; + if (ino && file->f_version == inode->i_version) { pr_debug("AFFS: readdir() left off=%d\n", ino); goto inside; } @@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", i); - goto readdir_out; + return -EIO; } ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); @@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); if (!ino) continue; - f_pos = (hash_pos << 16) + 2; + ctx->pos = (hash_pos << 16) + 2; inside: do { fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", ino); - goto readdir_done; + break; } namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", - namelen, name, ino, hash_pos, f_pos); - if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0) + namelen, name, ino, hash_pos, (u32)ctx->pos); + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) goto readdir_done; - stored++; - f_pos++; + ctx->pos++; ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); fh_bh = NULL; } while (ino); } readdir_done: - filp->f_pos = f_pos; - filp->f_version = inode->i_version; - filp->private_data = (void *)(long)ino; - res = stored; + file->f_version = inode->i_version; + file->private_data = (void *)(long)ino; readdir_out: affs_brelse(dir_bh); affs_brelse(fh_bh); affs_unlock_dir(inode); - pr_debug("AFFS: readdir()=%d\n", stored); - return res; + return 0; } diff --git a/fs/affs/namei.c b/fs/affs/namei.c index ff65884a7839..c36cbb4537a2 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -13,18 +13,12 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); -static int affs_hash_dentry(const struct dentry *, - const struct inode *, struct qstr *); -static int affs_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int affs_hash_dentry(const struct dentry *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); -static int affs_intl_hash_dentry(const struct dentry *, - const struct inode *, struct qstr *); -static int affs_intl_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int affs_intl_hash_dentry(const struct dentry *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); const struct dentry_operations affs_dentry_operations = { @@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) } static int -affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { return __affs_hash_dentry(qstr, affs_toupper); } static int -affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { return __affs_hash_dentry(qstr, affs_intl_toupper); } @@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len, } static int -affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return __affs_compare_dentry(len, str, name, affs_toupper); } static int -affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return __affs_compare_dentry(len, str, name, affs_intl_toupper); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 7a465ed04444..34494fbead0a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -22,7 +22,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); -static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); +static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); @@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, .release = afs_release, - .readdir = afs_readdir, + .iterate = afs_readdir, .lock = afs_lock, .llseek = generic_file_llseek, }; @@ -119,9 +119,9 @@ struct afs_dir_page { }; struct afs_lookup_cookie { + struct dir_context ctx; struct afs_fid fid; - const char *name; - size_t nlen; + struct qstr name; int found; }; @@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file) /* * deal with one block in an AFS directory */ -static int afs_dir_iterate_block(unsigned *fpos, +static int afs_dir_iterate_block(struct dir_context *ctx, union afs_dir_block *block, - unsigned blkoff, - void *cookie, - filldir_t filldir) + unsigned blkoff) { union afs_dirent *dire; unsigned offset, next, curr; size_t nlen; - int tmp, ret; + int tmp; - _enter("%u,%x,%p,,",*fpos,blkoff,block); + _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); - curr = (*fpos - blkoff) / sizeof(union afs_dirent); + curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); /* walk through the block, an entry at a time */ for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; @@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos, _debug("ENT[%Zu.%u]: unused", blkoff / sizeof(union afs_dir_block), offset); if (offset >= curr) - *fpos = blkoff + + ctx->pos = blkoff + next * sizeof(union afs_dirent); continue; } @@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos, continue; /* found the next entry */ - ret = filldir(cookie, - dire->u.name, - nlen, - blkoff + offset * sizeof(union afs_dirent), + if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), - filldir == afs_lookup_filldir ? - ntohl(dire->u.unique) : DT_UNKNOWN); - if (ret < 0) { + ctx->actor == afs_lookup_filldir ? + ntohl(dire->u.unique) : DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } - *fpos = blkoff + next * sizeof(union afs_dirent); + ctx->pos = blkoff + next * sizeof(union afs_dirent); } _leave(" = 1 [more]"); @@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos, /* * iterate through the data blob that lists the contents of an AFS directory */ -static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, - filldir_t filldir, struct key *key) +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct key *key) { union afs_dir_block *dblock; struct afs_dir_page *dbuf; @@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, unsigned blkoff, limit; int ret; - _enter("{%lu},%u,,", dir->i_ino, *fpos); + _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { _leave(" = -ESTALE"); @@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, } /* round the file position up to the next entry boundary */ - *fpos += sizeof(union afs_dirent) - 1; - *fpos &= ~(sizeof(union afs_dirent) - 1); + ctx->pos += sizeof(union afs_dirent) - 1; + ctx->pos &= ~(sizeof(union afs_dirent) - 1); /* walk through the blocks in sequence */ ret = 0; - while (*fpos < dir->i_size) { - blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); + while (ctx->pos < dir->i_size) { + blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); /* fetch the appropriate page from the directory */ page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); @@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / sizeof(union afs_dir_block)]; - ret = afs_dir_iterate_block(fpos, dblock, blkoff, - cookie, filldir); + ret = afs_dir_iterate_block(ctx, dblock, blkoff); if (ret != 1) { afs_dir_put_page(page); goto out; @@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, blkoff += sizeof(union afs_dir_block); - } while (*fpos < dir->i_size && blkoff < limit); + } while (ctx->pos < dir->i_size && blkoff < limit); afs_dir_put_page(page); ret = 0; @@ -387,23 +380,10 @@ out: /* * read an AFS directory */ -static int afs_readdir(struct file *file, void *cookie, filldir_t filldir) +static int afs_readdir(struct file *file, struct dir_context *ctx) { - unsigned fpos; - int ret; - - _enter("{%Ld,{%lu}}", - file->f_pos, file_inode(file)->i_ino); - - ASSERT(file->private_data != NULL); - - fpos = file->f_pos; - ret = afs_dir_iterate(file_inode(file), &fpos, - cookie, filldir, file->private_data); - file->f_pos = fpos; - - _leave(" = %d", ret); - return ret; + return afs_dir_iterate(file_inode(file), + ctx, file->private_data); } /* @@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, { struct afs_lookup_cookie *cookie = _cookie; - _enter("{%s,%Zu},%s,%u,,%llu,%u", - cookie->name, cookie->nlen, name, nlen, + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, (unsigned long long) ino, dtype); /* insanity checks first */ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); BUILD_BUG_ON(sizeof(union afs_dirent) != 32); - if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { + if (cookie->name.len != nlen || + memcmp(cookie->name.name, name, nlen) != 0) { _leave(" = 0 [no]"); return 0; } @@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, static int afs_do_lookup(struct inode *dir, struct dentry *dentry, struct afs_fid *fid, struct key *key) { - struct afs_lookup_cookie cookie; - struct afs_super_info *as; - unsigned fpos; + struct afs_super_info *as = dir->i_sb->s_fs_info; + struct afs_lookup_cookie cookie = { + .ctx.actor = afs_lookup_filldir, + .name = dentry->d_name, + .fid.vid = as->volume->vid + }; int ret; _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); - as = dir->i_sb->s_fs_info; - /* search the directory */ - cookie.name = dentry->d_name.name; - cookie.nlen = dentry->d_name.len; - cookie.fid.vid = as->volume->vid; - cookie.found = 0; - - fpos = 0; - ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir, - key); + ret = afs_dir_iterate(dir, &cookie.ctx, key); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; diff --git a/fs/afs/file.c b/fs/afs/file.c index 8f6e9234d565..66d50fe2ee45 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,7 +19,8 @@ #include "internal.h" static int afs_readpage(struct file *file, struct page *page); -static void afs_invalidatepage(struct page *page, unsigned long offset); +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); static int afs_launder_page(struct page *page); @@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page) * - release a page and clean up its private data if offset is 0 (indicating * the entire page) */ -static void afs_invalidatepage(struct page *page, unsigned long offset) +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct afs_writeback *wb = (struct afs_writeback *) page_private(page); - _enter("{%lu},%lu", page->index, offset); + _enter("{%lu},%u,%u", page->index, offset, length); BUG_ON(!PageLocked(page)); /* we clean up only if the entire page is being invalidated */ - if (offset == 0) { + if (offset == 0 && length == PAGE_CACHE_SIZE) { #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 2497bf306c70..a8cf2cff836c 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct afs_vnode *vnode = AFS_FS_I(inode); afs_lock_type_t type; struct key *key = file->private_data; int ret; @@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - lock_flocks(); + spin_lock(&inode->i_lock); /* make sure we've got a callback on this file and that our view of the * data version is up to date */ @@ -420,7 +421,7 @@ given_lock: afs_vnode_fetch_status(vnode, NULL, key); error: - unlock_flocks(); + spin_unlock(&inode->i_lock); _leave(" = %d", ret); return ret; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 096b23f821a1..526e4bbbde59 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -190,7 +190,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = PDE_DATA(inode); return 0; } @@ -448,7 +448,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; @@ -554,7 +554,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; @@ -659,7 +659,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, @@ -8,6 +8,8 @@ * * See ../COPYING for licensing terms. */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,8 +20,6 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#define DEBUG 0 - #include <linux/sched.h> #include <linux/fs.h> #include <linux/file.h> @@ -39,11 +39,78 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> -#if DEBUG > 1 -#define dprintk printk -#else -#define dprintk(x...) do { ; } while (0) -#endif +#include "internal.h" + +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 + +struct kioctx { + atomic_t users; + atomic_t dead; + + /* This needs improving */ + unsigned long user_id; + struct hlist_node list; + + /* + * This is what userspace passed to io_setup(), it's not used for + * anything but counting against the global max_reqs quota. + * + * The real limit is nr_events - 1, which will be larger (see + * aio_setup_ring()) + */ + unsigned max_reqs; + + /* Size of ringbuffer, in units of struct io_event */ + unsigned nr_events; + + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct rcu_head rcu_head; + struct work_struct rcu_work; + + struct { + atomic_t reqs_active; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + + struct { + struct mutex ring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + unsigned tail; + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + + struct page *internal_pages[AIO_RING_PAGES]; +}; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); @@ -54,11 +121,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; -static struct workqueue_struct *aio_wq; - -static void aio_kick_handler(struct work_struct *); -static void aio_queue_work(struct kioctx *); - /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. @@ -68,10 +130,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ - BUG_ON(!aio_wq); - - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); return 0; } @@ -79,28 +138,20 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; long i; - for (i=0; i<info->nr_pages; i++) - put_page(info->ring_pages[i]); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); - if (info->mmap_size) { - BUG_ON(ctx->mm != current->mm); - vm_munmap(info->mmap_base, info->mmap_size); - } - - if (info->ring_pages && info->ring_pages != info->internal_pages) - kfree(info->ring_pages); - info->ring_pages = NULL; - info->nr = 0; + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + kfree(ctx->ring_pages); } static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; - struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; @@ -116,46 +167,44 @@ static int aio_setup_ring(struct kioctx *ctx) nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; - info->ring_pages = info->internal_pages; + ctx->nr_events = 0; + ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!info->ring_pages) + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) return -ENOMEM; } - info->mmap_size = nr_pages * PAGE_SIZE; - dprintk("attempting mmap of %lu bytes\n", info->mmap_size); - down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, - &populate); - if (IS_ERR((void *)info->mmap_base)) { - up_write(&ctx->mm->mmap_sem); - info->mmap_size = 0; + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); + ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + if (IS_ERR((void *)ctx->mmap_base)) { + up_write(&mm->mmap_sem); + ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } - dprintk("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, nr_pages, - 1, 0, info->ring_pages, NULL); - up_write(&ctx->mm->mmap_sem); + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, + 1, 0, ctx->ring_pages, NULL); + up_write(&mm->mmap_sem); - if (unlikely(info->nr_pages != nr_pages)) { + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, populate); - - ctx->user_id = info->mmap_base; + mm_populate(ctx->mmap_base, populate); - info->nr = nr_events; /* trusted copy */ + ctx->user_id = ctx->mmap_base; + ctx->nr_events = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -164,72 +213,130 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); return 0; } - -/* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(). Release the pointer with put_aio_ring_event(); - */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) - -#define put_aio_ring_event(event) do { \ - struct io_event *__event = (event); \ - (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ -} while(0) - -static void ctx_rcu_free(struct rcu_head *head) +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +{ + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, + struct io_event *res) +{ + kiocb_cancel_fn *old, *cancel; + int ret = -EINVAL; + + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return ret; + + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); + + atomic_inc(&kiocb->ki_users); + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, res); + + spin_lock_irq(&ctx->ctx_lock); + + return ret; +} + +static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); kmem_cache_free(kioctx_cachep, ctx); } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. */ -static void __put_ioctx(struct kioctx *ctx) +static void free_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); + struct aio_ring *ring; + struct io_event res; + struct kiocb *req; + unsigned head, avail; - cancel_delayed_work_sync(&ctx->wq); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); } - pr_debug("__put_ioctx: freeing %p\n", ctx); - call_rcu(&ctx->rcu_head, ctx_rcu_free); -} -static inline int try_get_ioctx(struct kioctx *kioctx) -{ - return atomic_inc_not_zero(&kioctx->users); + spin_unlock_irq(&ctx->ctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + while (atomic_read(&ctx->reqs_active) > 0) { + wait_event(ctx->wait, + head != ctx->tail || + atomic_read(&ctx->reqs_active) <= 0); + + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + + atomic_sub(avail, &ctx->reqs_active); + head += avail; + head %= ctx->nr_events; + } + + WARN_ON(atomic_read(&ctx->reqs_active) < 0); + + aio_free_ring(ctx); + + pr_debug("freeing %p\n", ctx); + + /* + * Here the call_rcu() is between the wait_event() for reqs_active to + * hit 0, and freeing the ioctx. + * + * aio_complete() decrements reqs_active, but it has to touch the ioctx + * after to issue a wakeup so we use rcu. + */ + call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static inline void put_ioctx(struct kioctx *kioctx) +static void put_ioctx(struct kioctx *ctx) { - BUG_ON(atomic_read(&kioctx->users) <= 0); - if (unlikely(atomic_dec_and_test(&kioctx->users))) - __put_ioctx(kioctx); + if (unlikely(atomic_dec_and_test(&ctx->users))) + free_ioctx(ctx); } /* ioctx_alloc @@ -237,7 +344,7 @@ static inline void put_ioctx(struct kioctx *kioctx) */ static struct kioctx *ioctx_alloc(unsigned nr_events) { - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; @@ -256,17 +363,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-ENOMEM); ctx->max_reqs = nr_events; - mm = ctx->mm = current->mm; - atomic_inc(&mm->mm_count); atomic_set(&ctx->users, 2); + atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); + spin_lock_init(&ctx->completion_lock); + mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - INIT_LIST_HEAD(&ctx->run_list); - INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); if (aio_setup_ring(ctx) < 0) goto out_freectx; @@ -286,64 +391,63 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); spin_unlock(&mm->ioctx_lock); - dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; aio_free_ring(ctx); out_freectx: - mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - dprintk("aio: error allocating ioctx %d\n", err); + pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -/* kill_ctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage +static void kill_ioctx_work(struct work_struct *work) +{ + struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); + + wake_up_all(&ctx->wait); + put_ioctx(ctx); +} + +static void kill_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + INIT_WORK(&ctx->rcu_work, kill_ioctx_work); + schedule_work(&ctx->rcu_work); +} + +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ctx(struct kioctx *ctx) +static void kill_ioctx(struct kioctx *ctx) { - int (*cancel)(struct kiocb *, struct io_event *); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - struct io_event res; + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); - spin_lock_irq(&ctx->ctx_lock); - ctx->dead = 1; - while (!list_empty(&ctx->active_reqs)) { - struct list_head *pos = ctx->active_reqs.next; - struct kiocb *iocb = list_kiocb(pos); - list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; - kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } - } + /* + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). + */ + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); - if (!ctx->reqs_active) - goto out; + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - add_wait_queue(&ctx->wait, &wait); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (ctx->reqs_active) { - spin_unlock_irq(&ctx->ctx_lock); - io_schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&ctx->ctx_lock); + /* Between hlist_del_rcu() and dropping the initial ref */ + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); - -out: - spin_unlock_irq(&ctx->ctx_lock); } /* wait_on_sync_kiocb: @@ -351,9 +455,9 @@ out: */ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { - while (iocb->ki_users) { + while (atomic_read(&iocb->ki_users)) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!iocb->ki_users) + if (!atomic_read(&iocb->ki_users)) break; io_schedule(); } @@ -362,28 +466,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) } EXPORT_SYMBOL(wait_on_sync_kiocb); -/* exit_aio: called when the last user of mm goes away. At this point, - * there is no way for any new requests to be submited or any of the - * io_* syscalls to be called on the context. However, there may be - * outstanding requests which hold references to the context; as they - * go away, they will call put_ioctx and release any pinned memory - * associated with the request (held via struct page * references). +/* + * exit_aio: called when the last user of mm goes away. At this point, there is + * no way for any new requests to be submited or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx *ctx; + struct hlist_node *n; - while (!hlist_empty(&mm->ioctx_list)) { - ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); - hlist_del_rcu(&ctx->list); - - kill_ctx(ctx); - + hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), ctx->dead, - ctx->reqs_active); + atomic_read(&ctx->users), + atomic_read(&ctx->dead), + atomic_read(&ctx->reqs_active)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -391,150 +493,50 @@ void exit_aio(struct mm_struct *mm) * as indicator that it needs to unmap the area, * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. - * That way we get all munmap done to current->mm - - * all other callers have ctx->mm == current->mm. */ - ctx->ring_info.mmap_size = 0; - put_ioctx(ctx); + ctx->mmap_size = 0; + + kill_ioctx(ctx); } } /* aio_get_req - * Allocate a slot for an aio request. Increments the users count + * Allocate a slot for an aio request. Increments the ki_users count * of the kioctx so that the kioctx stays around until all requests are * complete. Returns NULL if no requests are free. * - * Returns with kiocb->users set to 2. The io submit code path holds + * Returns with kiocb->ki_users set to 2. The io submit code path holds * an extra reference while submitting the i/o. * This prevents races between the aio code path referencing the * req (after submitting it) and aio_complete() freeing the req. */ -static struct kiocb *__aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req = NULL; + struct kiocb *req; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); - if (unlikely(!req)) + if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) return NULL; - req->ki_flags = 0; - req->ki_users = 2; - req->ki_key = 0; - req->ki_ctx = ctx; - req->ki_cancel = NULL; - req->ki_retry = NULL; - req->ki_dtor = NULL; - req->private = NULL; - req->ki_iovec = NULL; - INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = NULL; - - return req; -} - -/* - * struct kiocb's are allocated in batches to reduce the number of - * times the ctx lock is acquired and released. - */ -#define KIOCB_BATCH_SIZE 32L -struct kiocb_batch { - struct list_head head; - long count; /* number of requests left to allocate */ -}; - -static void kiocb_batch_init(struct kiocb_batch *batch, long total) -{ - INIT_LIST_HEAD(&batch->head); - batch->count = total; -} - -static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) -{ - struct kiocb *req, *n; - - if (list_empty(&batch->head)) - return; - - spin_lock_irq(&ctx->ctx_lock); - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - list_del(&req->ki_list); - kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - } - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * Allocate a batch of kiocbs. This avoids taking and dropping the - * context lock a lot during setup. - */ -static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) -{ - unsigned short allocated, to_alloc; - long avail; - struct kiocb *req, *n; - struct aio_ring *ring; - - to_alloc = min(batch->count, KIOCB_BATCH_SIZE); - for (allocated = 0; allocated < to_alloc; allocated++) { - req = __aio_get_req(ctx); - if (!req) - /* allocation failed, go with what we've got */ - break; - list_add(&req->ki_batch, &batch->head); - } - - if (allocated == 0) - goto out; - - spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - - avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; - BUG_ON(avail < 0); - if (avail < allocated) { - /* Trim back the number of requests. */ - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - if (--allocated <= avail) - break; - } - } - - batch->count -= allocated; - list_for_each_entry(req, &batch->head, ki_batch) { - list_add(&req->ki_list, &ctx->active_reqs); - ctx->reqs_active++; - } - - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); + if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) + goto out_put; -out: - return allocated; -} + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) + goto out_put; -static inline struct kiocb *aio_get_req(struct kioctx *ctx, - struct kiocb_batch *batch) -{ - struct kiocb *req; + atomic_set(&req->ki_users, 2); + req->ki_ctx = ctx; - if (list_empty(&batch->head)) - if (kiocb_batch_refill(ctx, batch) == 0) - return NULL; - req = list_first_entry(&batch->head, struct kiocb, ki_batch); - list_del(&req->ki_batch); return req; +out_put: + atomic_dec(&ctx->reqs_active); + return NULL; } -static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +static void kiocb_free(struct kiocb *req) { - assert_spin_locked(&ctx->ctx_lock); - + if (req->ki_filp) + fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) @@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) if (req->ki_iovec != &req->ki_inline_vec) kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); } -/* __aio_put_req - * Returns true if this put was the last user of the request. - */ -static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +void aio_put_req(struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); - - assert_spin_locked(&ctx->ctx_lock); - - req->ki_users--; - BUG_ON(req->ki_users < 0); - if (likely(req->ki_users)) - return 0; - list_del(&req->ki_list); /* remove from active_reqs */ - req->ki_cancel = NULL; - req->ki_retry = NULL; - - fput(req->ki_filp); - req->ki_filp = NULL; - really_put_req(ctx, req); - return 1; -} - -/* aio_put_req - * Returns true if this put was the last user of the kiocb, - * false if the request is still in use. - */ -int aio_put_req(struct kiocb *req) -{ - struct kioctx *ctx = req->ki_ctx; - int ret; - spin_lock_irq(&ctx->ctx_lock); - ret = __aio_put_req(ctx, req); - spin_unlock_irq(&ctx->ctx_lock); - return ret; + if (atomic_dec_and_test(&req->ki_users)) + kiocb_free(req); } EXPORT_SYMBOL(aio_put_req); @@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - /* - * RCU protects us against accessing freed memory but - * we have to be careful not to get a reference when the - * reference count already dropped to 0 (ctx->dead test - * is unreliable because of races). - */ - if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + if (ctx->user_id == ctx_id) { + atomic_inc(&ctx->users); ret = ctx; break; } @@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) return ret; } -/* - * Queue up a kiocb to be retried. Assumes that the kiocb - * has already been marked as kicked, and places it on - * the retry run list for the corresponding ioctx, if it - * isn't already queued. Returns 1 if it actually queued - * the kiocb (to tell the caller to activate the work - * queue to process it), or 0, if it found that it was - * already queued. - */ -static inline int __queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - - assert_spin_locked(&ctx->ctx_lock); - - if (list_empty(&iocb->ki_run_list)) { - list_add_tail(&iocb->ki_run_list, - &ctx->run_list); - return 1; - } - return 0; -} - -/* aio_run_iocb - * This is the core aio execution routine. It is - * invoked both for initial i/o submission and - * subsequent retries via the aio_kick_handler. - * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reacquired - * as needed during processing. - * - * Calls the iocb retry method (already setup for the - * iocb on initial submission) for operation specific - * handling, but takes care of most of common retry - * execution details for a given iocb. The retry method - * needs to be non-blocking as far as possible, to avoid - * holding up other iocbs waiting to be serviced by the - * retry kernel thread. - * - * The trickier parts in this code have to do with - * ensuring that only one retry instance is in progress - * for a given iocb at any time. Providing that guarantee - * simplifies the coding of individual aio operations as - * it avoids various potential races. - */ -static ssize_t aio_run_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - ssize_t (*retry)(struct kiocb *); - ssize_t ret; - - if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); - return 0; - } - - /* - * We don't want the next retry iteration for this - * operation to start until this one has returned and - * updated the iocb state. However, wait_queue functions - * can trigger a kick_iocb from interrupt context in the - * meantime, indicating that data is available for the next - * iteration. We want to remember that and enable the - * next retry iteration _after_ we are through with - * this one. - * - * So, in order to be able to register a "kick", but - * prevent it from being queued now, we clear the kick - * flag, but make the kick code *think* that the iocb is - * still on the run list until we are actually done. - * When we are done with this iteration, we check if - * the iocb was kicked in the meantime and if so, queue - * it up afresh. - */ - - kiocbClearKicked(iocb); - - /* - * This is so that aio_complete knows it doesn't need to - * pull the iocb off the run list (We can't just call - * INIT_LIST_HEAD because we don't want a kick_iocb to - * queue this on the run list yet) - */ - iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; - spin_unlock_irq(&ctx->ctx_lock); - - /* Quit retrying if the i/o has been cancelled */ - if (kiocbIsCancelled(iocb)) { - ret = -EINTR; - aio_complete(iocb, ret, 0); - /* must not access the iocb after this */ - goto out; - } - - /* - * Now we are all set to call the retry method in async - * context. - */ - ret = retry(iocb); - - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. - */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(iocb, ret, 0); - } -out: - spin_lock_irq(&ctx->ctx_lock); - - if (-EIOCBRETRY == ret) { - /* - * OK, now that we are done with this iteration - * and know that there is more left to go, - * this is where we let go so that a subsequent - * "kick" can start the next iteration - */ - - /* will make __queue_kicked_iocb succeed from here on */ - INIT_LIST_HEAD(&iocb->ki_run_list); - /* we must queue the next iteration ourselves, if it - * has already been kicked */ - if (kiocbIsKicked(iocb)) { - __queue_kicked_iocb(iocb); - - /* - * __queue_kicked_iocb will always return 1 here, because - * iocb->ki_run_list is empty at this point so it should - * be safe to unconditionally queue the context into the - * work queue. - */ - aio_queue_work(ctx); - } - } - return ret; -} - -/* - * __aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static int __aio_run_iocbs(struct kioctx *ctx) -{ - struct kiocb *iocb; - struct list_head run_list; - - assert_spin_locked(&ctx->ctx_lock); - - list_replace_init(&ctx->run_list, &run_list); - while (!list_empty(&run_list)) { - iocb = list_entry(run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - /* - * Hold an extra reference while retrying i/o. - */ - iocb->ki_users++; /* grab extra reference */ - aio_run_iocb(iocb); - __aio_put_req(ctx, iocb); - } - if (!list_empty(&ctx->run_list)) - return 1; - return 0; -} - -static void aio_queue_work(struct kioctx * ctx) -{ - unsigned long timeout; - /* - * if someone is waiting, get the work started right - * away, otherwise, use a longer delay - */ - smp_mb(); - if (waitqueue_active(&ctx->wait)) - timeout = 1; - else - timeout = HZ/10; - queue_delayed_work(aio_wq, &ctx->wq, timeout); -} - -/* - * aio_run_all_iocbs: - * Process all pending retries queued on the ioctx - * run list, and keep running them until the list - * stays empty. - * Assumes it is operating within the aio issuer's mm context. - */ -static inline void aio_run_all_iocbs(struct kioctx *ctx) -{ - spin_lock_irq(&ctx->ctx_lock); - while (__aio_run_iocbs(ctx)) - ; - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * aio_kick_handler: - * Work queue handler triggered to process pending - * retries on an ioctx. Takes on the aio issuer's - * mm context before running the iocbs, so that - * copy_xxx_user operates on the issuer's address - * space. - * Run on aiod's context. - */ -static void aio_kick_handler(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, wq.work); - mm_segment_t oldfs = get_fs(); - struct mm_struct *mm; - int requeue; - - set_fs(USER_DS); - use_mm(ctx->mm); - spin_lock_irq(&ctx->ctx_lock); - requeue =__aio_run_iocbs(ctx); - mm = ctx->mm; - spin_unlock_irq(&ctx->ctx_lock); - unuse_mm(mm); - set_fs(oldfs); - /* - * we're in a worker thread already; no point using non-zero delay - */ - if (requeue) - queue_delayed_work(aio_wq, &ctx->wq, 0); -} - - -/* - * Called by kick_iocb to queue the kiocb for retry - * and if required activate the aio work queue to process - * it - */ -static void try_queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - unsigned long flags; - int run = 0; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - /* set this inside the lock so that we can't race with aio_run_iocb() - * testing it and putting the iocb on the run list under the lock */ - if (!kiocbTryKick(iocb)) - run = __queue_kicked_iocb(iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (run) - aio_queue_work(ctx); -} - -/* - * kick_iocb: - * Called typically from a wait queue callback context - * to trigger a retry of the iocb. - * The retry is usually executed by aio workqueue - * threads (See aio_kick_handler). - */ -void kick_iocb(struct kiocb *iocb) -{ - /* sync iocbs are easy: they can only ever be executing from a - * single context. */ - if (is_sync_kiocb(iocb)) { - kiocbSetKicked(iocb); - wake_up_process(iocb->ki_obj.tsk); - return; - } - - try_queue_kicked_iocb(iocb); -} -EXPORT_SYMBOL(kick_iocb); - /* aio_complete * Called when the io request on the given iocb is complete. - * Returns true if this is the last user of the request. The - * only other user of the request can be the cancellation code. */ -int aio_complete(struct kiocb *iocb, long res, long res2) +void aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; struct aio_ring *ring; - struct io_event *event; + struct io_event *ev_page, *event; unsigned long flags; - unsigned long tail; - int ret; + unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(iocb->ki_users != 1); + BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - iocb->ki_users = 0; + atomic_set(&iocb->ki_users, 0); wake_up_process(iocb->ki_obj.tsk); - return 1; + return; } - info = &ctx->ring_info; - - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. + /* + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after decrementing reqs_active. */ - spin_lock_irqsave(&ctx->ctx_lock, flags); + rcu_read_lock(); - if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) - list_del_init(&iocb->ki_run_list); + if (iocb->ki_list.next) { + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* * cancelled requests don't get events, userland was given one * when the event got cancelled. */ - if (kiocbIsCancelled(iocb)) + if (unlikely(xchg(&iocb->ki_cancel, + KIOCB_CANCELLED) == KIOCB_CANCELLED)) { + atomic_dec(&ctx->reqs_active); + /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; + } - ring = kmap_atomic(info->ring_pages[0]); + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->completion_lock to prevent other code from messing with the tail + * pointer since we might be called from irq context. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + + tail = ctx->tail; + pos = tail + AIO_EVENTS_OFFSET; - tail = info->tail; - event = aio_ring_event(info, tail); - if (++tail >= info->nr) + if (++tail >= ctx->nr_events) tail = 0; + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + res, res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ - info->tail = tail; - ring->tail = tail; + ctx->tail = tail; - put_aio_ring_event(event); + ring = kmap_atomic(ctx->ring_pages[0]); + ring->tail = tail; kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an @@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) put_rq: /* everything turned out well, dispose of the aiocb. */ - ret = __aio_put_req(ctx, iocb); + aio_put_req(iocb); /* * We have to order our ring_info tail store above and test @@ -988,233 +690,133 @@ put_rq: if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - return ret; + rcu_read_unlock(); } EXPORT_SYMBOL(aio_complete); -/* aio_read_evt - * Pull an event off of the ioctx's event ring. Returns the number of - * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. - * TODO: make the ringbuffer user mmap()able (requires FIXME). +/* aio_read_events + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched */ -static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; - - ring = kmap_atomic(info->ring_pages[0]); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; + unsigned head, pos; + long ret = 0; + int copy_ret; - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp); - } - spin_unlock(&info->ring_lock); + mutex_lock(&ctx->ring_lock); -out: - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; kunmap_atomic(ring); - return ret; -} -struct aio_timeout { - struct timer_list timer; - int timed_out; - struct task_struct *p; -}; + pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); -static void timeout_func(unsigned long data) -{ - struct aio_timeout *to = (struct aio_timeout *)data; + if (head == ctx->tail) + goto out; - to->timed_out = 1; - wake_up_process(to->p); -} + while (ret < nr) { + long avail; + struct io_event *ev; + struct page *page; -static inline void init_timeout(struct aio_timeout *to) -{ - setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); - to->timed_out = 0; - to->p = current; -} + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + if (head == ctx->tail) + break; -static inline void set_timeout(long start_jiffies, struct aio_timeout *to, - const struct timespec *ts) -{ - to->timer.expires = start_jiffies + timespec_to_jiffies(ts); - if (time_after(to->timer.expires, jiffies)) - add_timer(&to->timer); - else - to->timed_out = 1; -} + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); -static inline void clear_timeout(struct aio_timeout *to) -{ - del_singleshot_timer_sync(&to->timer); -} + pos = head + AIO_EVENTS_OFFSET; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; -static int read_events(struct kioctx *ctx, - long min_nr, long nr, - struct io_event __user *event, - struct timespec __user *timeout) -{ - long start_jiffies = jiffies; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - int ret; - int i = 0; - struct io_event ent; - struct aio_timeout to; - int retry = 0; - - /* needed to zero any padding within an entry (there shouldn't be - * any, but C is fun! - */ - memset(&ent, 0, sizeof(ent)); -retry: - ret = 0; - while (likely(i < nr)) { - ret = aio_read_evt(ctx, &ent); - if (unlikely(ret <= 0)) - break; + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, + sizeof(*ev) * avail); + kunmap(page); - dprintk("read event: %Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); - - /* Could we split the check in two? */ - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; } - ret = 0; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; + ret += avail; + head += avail; + head %= ctx->nr_events; } - if (min_nr <= i) - return i; - if (ret) - return ret; + ring = kmap_atomic(ctx->ring_pages[0]); + ring->head = head; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - /* End fast path */ + pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - /* racey check, but it gets redone */ - if (!retry && unlikely(!list_empty(&ctx->run_list))) { - retry = 1; - aio_run_all_iocbs(ctx); - goto retry; - } + atomic_sub(ret, &ctx->reqs_active); +out: + mutex_unlock(&ctx->ring_lock); - init_timeout(&to); - if (timeout) { - struct timespec ts; - ret = -EFAULT; - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - goto out; + return ret; +} - set_timeout(start_jiffies, &to, &ts); - } +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) +{ + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); - while (likely(i < nr)) { - add_wait_queue_exclusive(&ctx->wait, &wait); - do { - set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); - if (ret) - break; - if (min_nr <= i) - break; - if (unlikely(ctx->dead)) { - ret = -EINVAL; - break; - } - if (to.timed_out) /* Only check after read evt */ - break; - /* Try to only show up in io wait if there are ops - * in flight */ - if (ctx->reqs_active) - io_schedule(); - else - schedule(); - if (signal_pending(tsk)) { - ret = -EINTR; - break; - } - /*ret = aio_read_evt(ctx, &ent);*/ - } while (1) ; - - set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); - - if (unlikely(ret <= 0)) - break; + if (ret > 0) + *i += ret; - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; - } + if (unlikely(atomic_read(&ctx->dead))) + ret = -EINVAL; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; - } + if (!*i) + *i = ret; - if (timeout) - clear_timeout(&to); -out: - destroy_timer_on_stack(&to.timer); - return i ? i : ret; + return ret < 0 || *i >= min_nr; } -/* Take an ioctx and remove it from the list of ioctx's. Protects - * against races with itself via ->dead. - */ -static void io_destroy(struct kioctx *ioctx) +static long read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) { - struct mm_struct *mm = current->mm; - int was_dead; + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; - /* delete the entry from the list is someone else hasn't already */ - spin_lock(&mm->ioctx_lock); - was_dead = ioctx->dead; - ioctx->dead = 1; - hlist_del_rcu(&ioctx->list); - spin_unlock(&mm->ioctx_lock); + if (timeout) { + struct timespec ts; - dprintk("aio_release(%p)\n", ioctx); - if (likely(!was_dead)) - put_ioctx(ioctx); /* twice for the list */ + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + return -EFAULT; - kill_ctx(ioctx); + until = timespec_to_ktime(ts); + } /* - * Wake up any waiters. The setting of ctx->dead must be seen - * by other CPUs at this point. Right now, we rely on the - * locking done by the above calls to ensure this consistency. + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. So in practice we should be ok, but it's + * something to be aware of when touching this code. */ - wake_up_all(&ioctx->wait); + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), until); + + if (!ret && signal_pending(current)) + ret = -EINTR; + + return ret; } /* sys_io_setup: @@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); } @@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); return 0; } @@ -1301,29 +903,22 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) BUG_ON(ret > 0 && iocb->ki_left == 0); } -static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - ssize_t (*rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); ssize_t ret = 0; - unsigned short opcode; - - if ((iocb->ki_opcode == IOCB_CMD_PREADV) || - (iocb->ki_opcode == IOCB_CMD_PREAD)) { - rw_op = file->f_op->aio_read; - opcode = IOCB_CMD_PREADV; - } else { - rw_op = file->f_op->aio_write; - opcode = IOCB_CMD_PWRITEV; - } /* This matches the pread()/pwrite() logic */ if (iocb->ki_pos < 0) return -EINVAL; + if (rw == WRITE) + file_start_write(file); do { ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], iocb->ki_nr_segs - iocb->ki_cur_seg, @@ -1334,8 +929,10 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* retry all partial writes. retry partial reads as long as its a * regular file. */ } while (ret > 0 && iocb->ki_left > 0 && - (opcode == IOCB_CMD_PWRITEV || + (rw == WRITE || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); + if (rw == WRITE) + file_end_write(file); /* This means we must have transferred all that we could */ /* No need to retry anymore */ @@ -1344,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. */ - if (opcode == IOCB_CMD_PWRITEV - && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + if (rw == WRITE + && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; return ret; } -static ssize_t aio_fdsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 1); - return ret; -} - -static ssize_t aio_fsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 0); - return ret; -} - -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; + kiocb->ki_nr_segs = kiocb->ki_nbytes; + #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(type, + ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); else #endif - ret = rw_copy_check_uvector(type, + ret = rw_copy_check_uvector(rw, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); if (ret < 0) - goto out; - - ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); - if (ret < 0) - goto out; + return ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; - kiocb->ki_cur_seg = 0; - /* ki_nbytes/left now reflect bytes instead of segs */ + /* ki_nbytes now reflect bytes instead of segs */ kiocb->ki_nbytes = ret; - kiocb->ki_left = ret; - - ret = 0; -out: - return ret; + return 0; } -static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) { - int bytes; - - bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); - if (bytes < 0) - return bytes; + if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + return -EFAULT; kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; kiocb->ki_nr_segs = 1; - kiocb->ki_cur_seg = 0; return 0; } @@ -1427,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, bool compat) { - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + aio_rw_op *rw_op; - switch (kiocb->ki_opcode) { + switch (req->ki_opcode) { case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(READ, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(WRITE, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; - break; case IOCB_CMD_PREADV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = aio_setup_vectored_rw(READ, kiocb, compat); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; + mode = FMODE_READ; + rw = READ; + rw_op = file->f_op->aio_read; + goto rw_common; + + case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITEV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + mode = FMODE_WRITE; + rw = WRITE; + rw_op = file->f_op->aio_write; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!rw_op) + return -EINVAL; + + ret = (req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(rw, req, compat) + : aio_setup_single_vector(rw, req); if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; + return ret; + + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (ret < 0) + return ret; + + req->ki_nbytes = ret; + req->ki_left = ret; + + ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fdsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); break; + case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); break; + default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; + pr_debug("EINVAL: no operation provided\n"); + return -EINVAL; } - if (!kiocb->ki_retry) - return ret; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } return 0; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct kiocb_batch *batch, - bool compat) + struct iocb *iocb, bool compat) { struct kiocb *req; - struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { - pr_debug("EINVAL: io_submit: reserve field set\n"); + pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1530,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - file = fget(iocb->aio_fildes); - if (unlikely(!file)) - return -EBADF; - - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ - if (unlikely(!req)) { - fput(file); + req = aio_get_req(ctx); + if (unlikely(!req)) return -EAGAIN; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) { + ret = -EBADF; + goto out_put_req; } - req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an @@ -1555,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } } - ret = put_user(req->ki_key, &user_iocb->aio_key); + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { - dprintk("EFAULT: aio_key\n"); + pr_debug("EFAULT: aio_key\n"); goto out_put_req; } @@ -1569,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req, compat); - + ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - spin_lock_irq(&ctx->ctx_lock); - /* - * We could have raced with io_destroy() and are currently holding a - * reference to ctx which should be destroyed. We cannot submit IO - * since ctx gets freed as soon as io_submit() puts its reference. The - * check here is reliable: io_destroy() sets ctx->dead before waiting - * for outstanding IO and the barrier between these two is realized by - * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we - * increment ctx->reqs_active before checking for ctx->dead and the - * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we - * don't see ctx->dead set here, io_destroy() waits for our IO to - * finish. - */ - if (ctx->dead) { - spin_unlock_irq(&ctx->ctx_lock); - ret = -EINVAL; - goto out_put_req; - } - aio_run_iocb(req); - if (!list_empty(&ctx->run_list)) { - /* drain the run list */ - while (__aio_run_iocbs(ctx)) - ; - } - spin_unlock_irq(&ctx->ctx_lock); - aio_put_req(req); /* drop extra ref to req */ return 0; - out_put_req: + atomic_dec(&ctx->reqs_active); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; @@ -1616,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, long ret = 0; int i = 0; struct blk_plug plug; - struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1629,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { - pr_debug("EINVAL: io_submit: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - kiocb_batch_init(&batch, nr); - blk_start_plug(&plug); /* @@ -1655,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, compat); if (ret) break; } blk_finish_plug(&plug); - kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } @@ -1694,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, assert_spin_locked(&ctx->ctx_lock); + if (key != KIOCB_KEY) + return NULL; + /* TODO: use a hash or array, this sucks. */ list_for_each(pos, &ctx->active_reqs) { struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) + if (kiocb->ki_obj.user == iocb) return kiocb; } return NULL; @@ -1716,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1731,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb && kiocb->ki_cancel) { - cancel = kiocb->ki_cancel; - kiocb->ki_users ++; - kiocbSetCancelled(kiocb); - } else - cancel = NULL; + if (kiocb) + ret = kiocb_cancel(ctx, kiocb, &res); + else + ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); - if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. - */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } - } else - ret = -EINVAL; + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &res, sizeof(res))) + ret = -EFAULT; + } put_ioctx(ctx); @@ -1773,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by - * timeout is relative and will be updated if not NULL and the - * operation blocks. Will fail with -ENOSYS if not implemented. + * timeout is relative. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, @@ -1790,7 +1315,5 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, ret = read_events(ioctx, min_nr, nr, events, timeout); put_ioctx(ioctx); } - - asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); return ret; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 01443ce43ee7..3d9d3f5d5dda 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -61,15 +61,6 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) goto done; - - /* - * Otherwise it's an offset mount and we need to check - * if we can umount its mount, if there is one. - */ - if (!d_mountpoint(path.dentry)) { - status = 0; - goto done; - } } /* Update the expiry counter if fs is busy */ @@ -118,7 +109,7 @@ cont: spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); /* Already gone or negative dentry (under construction) - try next */ - if (q->d_count == 0 || !simple_positive(q)) { + if (!d_count(q) || !simple_positive(q)) { spin_unlock(&q->d_lock); next = q->d_u.d_child.next; goto cont; @@ -276,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, else ino_count++; - if (p->d_count > ino_count) { + if (d_count(p) > ino_count) { top_ino->last_used = jiffies; dput(p); return 1; @@ -418,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, if (!exp_leaves) { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (dentry->d_count > ino_count) + if (d_count(dentry) > ino_count) goto next; if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { @@ -432,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } else { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (dentry->d_count > ino_count) + if (d_count(dentry) > ino_count) goto next; expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 9bd16255dd9c..92ef341ba0cf 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = dcache_readdir, + .iterate = dcache_readdir, .llseek = dcache_dir_lseek, .unlocked_ioctl = autofs4_root_ioctl, #ifdef CONFIG_COMPAT @@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = { .open = autofs4_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = dcache_readdir, + .iterate = dcache_readdir, .llseek = dcache_dir_lseek, }; @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (active->d_count == 0) + if (!d_count(active)) goto next; qstr = &active->d_name; @@ -408,7 +408,7 @@ done: return NULL; } -int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 922ad460bff9..7c93953030fb 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return -EIO; } -static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int bad_file_readdir(struct file *file, struct dir_context *ctx) { return -EIO; } @@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops = .write = bad_file_write, .aio_read = bad_file_aio_read, .aio_write = bad_file_aio_write, - .readdir = bad_file_readdir, + .iterate = bad_file_readdir, .poll = bad_file_poll, .unlocked_ioctl = bad_file_unlocked_ioctl, .compat_ioctl = bad_file_compat_ioctl, diff --git a/fs/befs/btree.c b/fs/befs/btree.c index a66c9b1136e0..74e397db0b8b 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -436,8 +436,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, goto error; } - if ((this_node = (befs_btree_node *) - kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { + if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { befs_error(sb, "befs_btree_read() failed to allocate %u " "bytes of memory", sizeof (befs_btree_node)); goto error; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8615ee89ab55..e9c75e20db32 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -31,7 +31,7 @@ MODULE_LICENSE("GPL"); /* The units the vfs expects inode->i_blocks to be in */ #define VFS_BLOCK_SIZE 512 -static int befs_readdir(struct file *, void *, filldir_t); +static int befs_readdir(struct file *, struct dir_context *); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); @@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, - .readdir = befs_readdir, + .iterate = befs_readdir, .llseek = generic_file_llseek, }; @@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } static int -befs_readdir(struct file *filp, void *dirent, filldir_t filldir) +befs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_off_t value; @@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) size_t keysize; unsigned char d_type; char keybuf[BEFS_NAME_LEN + 1]; - char *nlsname; - int nlsnamelen; - const char *dirname = filp->f_path.dentry->d_name.name; + const char *dirname = file->f_path.dentry->d_name.name; befs_debug(sb, "---> befs_readdir() " - "name %s, inode %ld, filp->f_pos %Ld", - dirname, inode->i_ino, filp->f_pos); + "name %s, inode %ld, ctx->pos %Ld", + dirname, inode->i_ino, ctx->pos); - result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1, +more: + result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, keybuf, &keysize, &value); if (result == BEFS_ERR) { @@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* Convert to NLS */ if (BEFS_SB(sb)->nls) { + char *nlsname; + int nlsnamelen; result = befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); if (result < 0) { befs_debug(sb, "<--- befs_readdir() ERROR"); return result; } - result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos, - (ino_t) value, d_type); + if (!dir_emit(ctx, nlsname, nlsnamelen, + (ino_t) value, d_type)) { + kfree(nlsname); + return 0; + } kfree(nlsname); - } else { - result = filldir(dirent, keybuf, keysize, filp->f_pos, - (ino_t) value, d_type); + if (!dir_emit(ctx, keybuf, keysize, + (ino_t) value, d_type)) + return 0; } + ctx->pos++; + goto more; - filp->f_pos++; - - befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); + befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); return 0; } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 3f422f6bb5ca..a399e6d9dc74 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir, const unsigned char *name, int namelen, struct bfs_dirent **res_dir); -static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) +static int bfs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct buffer_head *bh; struct bfs_dirent *de; - struct bfs_sb_info *info = BFS_SB(dir->i_sb); unsigned int offset; int block; - mutex_lock(&info->bfs_lock); - - if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { + if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", - (unsigned long)f->f_pos, + (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); - mutex_unlock(&info->bfs_lock); - return -EBADF; + return -EINVAL; } - while (f->f_pos < dir->i_size) { - offset = f->f_pos & (BFS_BSIZE - 1); - block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS); + while (ctx->pos < dir->i_size) { + offset = ctx->pos & (BFS_BSIZE - 1); + block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); bh = sb_bread(dir->i_sb, block); if (!bh) { - f->f_pos += BFS_BSIZE - offset; + ctx->pos += BFS_BSIZE - offset; continue; } do { de = (struct bfs_dirent *)(bh->b_data + offset); if (de->ino) { int size = strnlen(de->name, BFS_NAMELEN); - if (filldir(dirent, de->name, size, f->f_pos, + if (!dir_emit(ctx, de->name, size, le16_to_cpu(de->ino), - DT_UNKNOWN) < 0) { + DT_UNKNOWN)) { brelse(bh); - mutex_unlock(&info->bfs_lock); return 0; } } offset += BFS_DIRENT_SIZE; - f->f_pos += BFS_DIRENT_SIZE; - } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size)); + ctx->pos += BFS_DIRENT_SIZE; + } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); brelse(bh); } - - mutex_unlock(&info->bfs_lock); - return 0; + return 0; } const struct file_operations bfs_dir_operations = { .read = generic_read_dir, - .readdir = bfs_readdir, + .iterate = bfs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bbc8f8827eac..89dec7f789a4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -62,7 +62,6 @@ static int aout_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; - current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); dump.signal = cprm->siginfo->si_signo; @@ -256,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { @@ -287,15 +284,12 @@ static int load_aout_binary(struct linux_binprm * bprm) return error; } - error = bprm->file->f_op->read(bprm->file, - (char __user *)text_addr, - ex.a_text+ex.a_data, &pos); + error = read_code(bprm->file, text_addr, pos, + ex.a_text+ex.a_data); if ((signed long)error < 0) { send_sig(SIGKILL, current, 0); return error; } - - flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) @@ -311,14 +305,9 @@ static int load_aout_binary(struct linux_binprm * bprm) } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - loff_t pos = fd_offset; vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - bprm->file->f_op->read(bprm->file, - (char __user *)N_TXTADDR(ex), - ex.a_text+ex.a_data, &pos); - flush_icache_range((unsigned long) N_TXTADDR(ex), - (unsigned long) N_TXTADDR(ex) + - ex.a_text+ex.a_data); + read_code(bprm->file, N_TXTADDR(ex), fd_offset, + ex.a_text + ex.a_data); goto beyond_if; } @@ -397,8 +386,6 @@ static int load_aout_library(struct file *file) start_addr = ex.a_entry & 0xfffff000; if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - loff_t pos = N_TXTOFF(ex); - if (printk_ratelimit()) { printk(KERN_WARNING @@ -407,11 +394,8 @@ static int load_aout_library(struct file *file) } vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - file->f_op->read(file, (char __user *)start_addr, - ex.a_text + ex.a_data, &pos); - flush_icache_range((unsigned long) start_addr, - (unsigned long) start_addr + ex.a_text + ex.a_data); - + read_code(file, start_addr, N_TXTOFF(ex), + ex.a_text + ex.a_data); retval = 0; goto out; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 86af964c2425..100edcc5e312 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, @@ -735,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { @@ -803,7 +804,8 @@ static int load_elf_binary(struct linux_binprm *bprm) * follow the loader, and is not movable. */ #ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off - * in runtime via sysctl. + * in runtime via sysctl or explicit setting of + * personality flags. * If that is the case, retain the original non-zero * load_bias value in order to establish proper * non-randomized mappings. @@ -2091,8 +2093,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto cleanup; has_dumped = 1; - current->flags |= PF_DUMPCORE; - + fs = get_fs(); set_fs(KERNEL_DS); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9c13e023e2b7..c166f325a183 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -483,7 +483,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, size_t platform_len = 0, len; char *k_platform, *k_base_platform; char __user *u_platform, *u_base_platform, *p; - long hwcap; int loop; int nr; /* reset for each csp adjustment */ @@ -502,8 +501,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; #endif - hwcap = ELF_HWCAP; - /* * If this architecture has a platform capability string, copy it * to userspace. In some cases (Sparc), this info is impossible @@ -617,7 +614,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); @@ -926,7 +926,6 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; - loff_t fpos; int loop, ret; load_addr = params->load_addr; @@ -964,14 +963,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( if (params->phdrs[loop].p_type != PT_LOAD) continue; - fpos = phdr->p_offset; - seg->addr = maddr + (phdr->p_vaddr - base); seg->p_vaddr = phdr->p_vaddr; seg->p_memsz = phdr->p_memsz; - ret = file->f_op->read(file, (void *) seg->addr, - phdr->p_filesz, &fpos); + ret = read_code(file, seg->addr, phdr->p_offset, + phdr->p_filesz); if (ret < 0) return ret; @@ -1687,8 +1684,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_elf_fdpic_header(elf, e_phnum); has_dumped = 1; - current->flags |= PF_DUMPCORE; - /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 2036d21baaef..d50bbe59da1e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -207,11 +207,12 @@ static int decompress_exec( /* Read in first chunk of data and parse gzip header. */ fpos = offset; - ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + ret = kernel_read(bprm->file, offset, buf, LBUFSIZE); strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; + fpos += ret; retval = -ENOEXEC; @@ -277,7 +278,7 @@ static int decompress_exec( } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { - ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE); if (ret <= 0) break; len -= ret; @@ -285,6 +286,7 @@ static int decompress_exec( strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; + fpos += ret; } if (ret < 0) { @@ -428,6 +430,7 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; + unsigned long full_data; unsigned long len, memp = 0; unsigned long memp_size, extra, rlim; unsigned long *reloc = 0, *rp; @@ -451,6 +454,7 @@ static int load_flat_file(struct linux_binprm * bprm, relocs = ntohl(hdr->reloc_count); flags = ntohl(hdr->flags); rev = ntohl(hdr->rev); + full_data = data_len + relocs * sizeof(unsigned long); if (strncmp(hdr->magic, "bFLT", 4)) { /* @@ -577,12 +581,12 @@ static int load_flat_file(struct linux_binprm * bprm, #ifdef CONFIG_BINFMT_ZFLAT if (flags & FLAT_FLAG_GZDATA) { result = decompress_exec(bprm, fpos, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), 0); + full_data, 0); } else #endif { - result = bprm->file->f_op->read(bprm->file, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), &fpos); + result = read_code(bprm->file, datapos, fpos, + full_data); } if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); @@ -627,30 +631,25 @@ static int load_flat_file(struct linux_binprm * bprm, if (flags & FLAT_FLAG_GZIP) { result = decompress_exec(bprm, sizeof (struct flat_hdr), (((char *) textpos) + sizeof (struct flat_hdr)), - (text_len + data_len + (relocs * sizeof(unsigned long)) + (text_len + full_data - sizeof (struct flat_hdr)), 0); memmove((void *) datapos, (void *) realdatastart, - data_len + (relocs * sizeof(unsigned long))); + full_data); } else if (flags & FLAT_FLAG_GZDATA) { - fpos = 0; - result = bprm->file->f_op->read(bprm->file, - (char *) textpos, text_len, &fpos); + result = read_code(bprm->file, textpos, 0, text_len); if (!IS_ERR_VALUE(result)) result = decompress_exec(bprm, text_len, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), 0); + full_data, 0); } else #endif { - fpos = 0; - result = bprm->file->f_op->read(bprm->file, - (char *) textpos, text_len, &fpos); - if (!IS_ERR_VALUE(result)) { - fpos = ntohl(hdr->data_start); - result = bprm->file->f_op->read(bprm->file, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), &fpos); - } + result = read_code(bprm->file, textpos, 0, text_len); + if (!IS_ERR_VALUE(result)) + result = read_code(bprm->file, datapos, + ntohl(hdr->data_start), + full_data); } if (IS_ERR_VALUE(result)) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 751df5e4f61a..1c740e152f38 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -23,6 +23,7 @@ #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/string_helpers.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/namei.h> @@ -234,24 +235,6 @@ static char *scanarg(char *s, char del) return s; } -static int unquote(char *from) -{ - char c = 0, *s = from, *p = from; - - while ((c = *s++) != '\0') { - if (c == '\\' && *s == 'x') { - s++; - c = toupper(*s++); - *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4; - c = toupper(*s++); - *p++ |= c - (isdigit(c) ? '0' : 'A' - 10); - continue; - } - *p++ = c; - } - return p - from; -} - static char * check_special_flags (char * sfs, Node * e) { char * p = sfs; @@ -354,8 +337,9 @@ static Node *create_entry(const char __user *buffer, size_t count) p[-1] = '\0'; if (!e->mask[0]) e->mask = NULL; - e->size = unquote(e->magic); - if (e->mask && unquote(e->mask) != e->size) + e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); + if (e->mask && + string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto Einval; if (e->size + e->offset > BINPRM_BUF_SIZE) goto Einval; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a3f28f331b2b..8fb42916d8a2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -27,48 +27,11 @@ #include <linux/workqueue.h> #include <linux/slab.h> -struct integrity_slab { - struct kmem_cache *slab; - unsigned short nr_vecs; - char name[8]; -}; - -#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } -struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { - IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), -}; -#undef IS +#define BIP_INLINE_VECS 4 +static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; -static inline unsigned int vecs_to_idx(unsigned int nr) -{ - switch (nr) { - case 1: - return 0; - case 2 ... 4: - return 1; - case 5 ... 16: - return 2; - case 17 ... 64: - return 3; - case 65 ... 128: - return 4; - case 129 ... BIO_MAX_PAGES: - return 5; - default: - BUG(); - } -} - -static inline int use_bip_pool(unsigned int idx) -{ - if (idx == BIOVEC_MAX_IDX) - return 1; - - return 0; -} - /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned int nr_vecs) { struct bio_integrity_payload *bip; - unsigned int idx = vecs_to_idx(nr_vecs); struct bio_set *bs = bio->bi_pool; - - if (!bs) - bs = fs_bio_set; - - BUG_ON(bio == NULL); - bip = NULL; - - /* Lower order allocations come straight from slab */ - if (!use_bip_pool(idx)) - bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); - - /* Use mempool if lower order alloc failed or max vecs were requested */ - if (bip == NULL) { - idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; + + if (!bs) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - - if (unlikely(bip == NULL)) { - printk(KERN_ERR "%s: could not alloc bip\n", __func__); - return NULL; - } + inline_vecs = BIP_INLINE_VECS; } + if (unlikely(!bip)) + return NULL; + memset(bip, 0, sizeof(*bip)); + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + } else { + bip->bip_vec = bip->bip_inline_vecs; + } + bip->bip_slab = idx; bip->bip_bio = bio; bio->bi_integrity = bip; return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return NULL; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - if (!bs) - bs = fs_bio_set; - - BUG_ON(bip == NULL); - - /* A cloned bio doesn't own the integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) - && bip->bip_buf != NULL) + if (bip->bip_owns_buf) kfree(bip->bip_buf); - if (use_bip_pool(bip->bip_slab)) + if (bs) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, bs->bio_integrity_pool); - else - kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + } else { + kfree(bip); + } bio->bi_integrity = NULL; } @@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio) return -EIO; } + bip->bip_owns_buf = 1; bip->bip_buf = buf; bip->bip_size = len; bip->bip_sector = bio->bi_sector; @@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->bio1.bi_integrity = &bp->bip1; bp->bio2.bi_integrity = &bp->bip2; - bp->iv1 = bip->bip_vec[0]; - bp->iv2 = bip->bip_vec[0]; + bp->iv1 = bip->bip_vec[bip->bip_idx]; + bp->iv2 = bip->bip_vec[bip->bip_idx]; - bp->bip1.bip_vec[0] = bp->iv1; - bp->bip2.bip_vec[0] = bp->iv2; + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; bp->iv1.bv_len = sectors * bi->tuple_size; bp->iv2.bv_offset += sectors * bi->tuple_size; @@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { - unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); - if (bs->bio_integrity_pool) return 0; - bs->bio_integrity_pool = - mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) + return -1; if (!bs->bio_integrity_pool) return -1; @@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs) { if (bs->bio_integrity_pool) mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); void __init bio_integrity_init(void) { - unsigned int i; - /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. @@ -781,14 +749,10 @@ void __init bio_integrity_init(void) if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); - for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { - unsigned int size; - - size = sizeof(struct bio_integrity_payload) - + bip_slab[i].nr_vecs * sizeof(struct bio_vec); - - bip_slab[i].slab = - kmem_cache_create(bip_slab[i].name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!bip_slab) + panic("Failed to create slab\n"); } @@ -19,6 +19,7 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> @@ -160,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx) return bvec_slabs[idx].nr_vecs; } -void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) { BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, bs->bvec_pool); + mempool_free(bv, pool); else { struct biovec_slab *bvs = bvec_slabs + idx; @@ -173,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) } } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, - struct bio_set *bs) +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) { struct bio_vec *bvl; @@ -210,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, */ if (*idx == BIOVEC_MAX_IDX) { fallback: - bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); @@ -252,8 +253,8 @@ static void bio_free(struct bio *bio) __bio_free(bio); if (bs) { - if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + if (bio_flagged(bio, BIO_OWNS_VEC)) + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -297,6 +298,54 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -314,11 +363,27 @@ EXPORT_SYMBOL(bio_reset); * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * * RETURNS: * Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -336,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } @@ -348,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + } + if (unlikely(!bvl)) goto err_free; + + bio->bi_flags |= 1 << BIO_OWNS_VEC; } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } @@ -652,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. + */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio->bi_sector += bytes >> 9; + bio->bi_size -= bytes; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + return; + + while (bytes) { + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + WARN_ONCE(1, "bio idx %d >= vcnt %d\n", + bio->bi_idx, bio->bi_vcnt); + break; + } + + if (bytes >= bio_iovec(bio)->bv_len) { + bytes -= bio_iovec(bio)->bv_len; + bio->bi_idx++; + } else { + bio_iovec(bio)->bv_len -= bytes; + bio_iovec(bio)->bv_offset += bytes; + bytes = 0; + } + } +} +EXPORT_SYMBOL(bio_advance); + +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bio_vec *src_bv, *dst_bv; + unsigned src_offset, dst_offset, bytes; + void *src_p, *dst_p; + + src_bv = bio_iovec(src); + dst_bv = bio_iovec(dst); + + src_offset = src_bv->bv_offset; + dst_offset = dst_bv->bv_offset; + + while (1) { + if (src_offset == src_bv->bv_offset + src_bv->bv_len) { + src_bv++; + if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { + src = src->bi_next; + if (!src) + break; + + src_bv = bio_iovec(src); + } + + src_offset = src_bv->bv_offset; + } + + if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { + dst_bv++; + if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { + dst = dst->bi_next; + if (!dst) + break; + + dst_bv = bio_iovec(dst); + } + + dst_offset = dst_bv->bv_offset; + } + + bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, + src_bv->bv_offset + src_bv->bv_len - src_offset); + + src_p = kmap_atomic(src_bv->bv_page); + dst_p = kmap_atomic(dst_bv->bv_page); + + memcpy(dst_p + dst_bv->bv_offset, + src_p + src_bv->bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + src_offset += bytes; + dst_offset += bytes; + } +} +EXPORT_SYMBOL(bio_copy_data); + struct bio_map_data { struct bio_vec *iovecs; struct sg_iovec *sgvecs; @@ -714,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int iov_idx = 0; unsigned int iov_off = 0; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = iovecs[i].bv_len; @@ -896,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); @@ -1110,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio) /* * make sure we dirty pages we wrote to */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1216,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) int i; char *p = bmd->sgvecs[0].iov_base; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; @@ -1256,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!reading) { void *p = data; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); memcpy(addr, p, bvec->bv_len); @@ -1301,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page && !PageCompound(page)) set_page_dirty_lock(page); @@ -1314,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio) static void bio_release_pages(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page) put_page(page); @@ -1367,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int nr_clean_pages = 0; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (PageDirty(page) || PageCompound(page)) { page_cache_release(page); - bvec[i].bv_page = NULL; + bvec->bv_page = NULL; } else { nr_clean_pages++; } @@ -1477,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); - BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); - BUG_ON(bi->bi_idx != 0); + BUG_ON(bio_segments(bi) > 1); atomic_set(&bp->cnt, 3); bp->error = 0; bp->bio1 = *bi; @@ -1488,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio1.bi_size = first_sectors << 9; if (bi->bi_vcnt != 0) { - bp->bv1 = bi->bi_io_vec[0]; - bp->bv2 = bi->bi_io_vec[0]; + bp->bv1 = *bio_iovec(bi); + bp->bv2 = *bio_iovec(bi); if (bio_is_rw(bi)) { bp->bv2.bv_offset += first_sectors << 9; @@ -1541,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, if (index >= bio->bi_idx) index = bio->bi_vcnt - 1; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { if (i == index) { if (offset > bv->bv_offset) sectors += (offset - bv->bv_offset) / sector_sz; @@ -1559,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ -static int biovec_create_pools(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); - if (!bs->bvec_pool) - return -ENOMEM; - - return 0; -} - -static void biovec_free_pools(struct bio_set *bs) -{ - mempool_destroy(bs->bvec_pool); + return mempool_create_slab_pool(pool_entries, bp->slab); } void bioset_free(struct bio_set *bs) { + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + if (bs->bio_pool) mempool_destroy(bs->bio_pool); + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + bioset_integrity_free(bs); - biovec_free_pools(bs); bio_put_slab(bs); kfree(bs); @@ -1612,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); @@ -1622,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (!biovec_create_pools(bs, pool_size)) - return bs; + bs->bvec_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_pool) + goto bad; + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + return bs; bad: bioset_free(bs); return NULL; diff --git a/fs/block_dev.c b/fs/block_dev.c index aae187a7f94a..bb43ce081d6e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" @@ -324,31 +325,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = file->f_mapping->host; - loff_t size; loff_t retval; mutex_lock(&bd_inode->i_mutex); - size = i_size_read(bd_inode); - - retval = -EINVAL; - switch (whence) { - case SEEK_END: - offset += size; - break; - case SEEK_CUR: - offset += file->f_pos; - case SEEK_SET: - break; - default: - goto out; - } - if (offset >= 0 && offset <= size) { - if (offset != file->f_pos) { - file->f_pos = offset; - } - retval = offset; - } -out: + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); mutex_unlock(&bd_inode->i_mutex); return retval; } @@ -617,11 +597,9 @@ void bd_forget(struct inode *inode) struct block_device *bdev = NULL; spin_lock(&bdev_lock); - if (inode->i_bdev) { - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - __bd_forget(inode); - } + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); spin_unlock(&bdev_lock); if (bdev) @@ -1047,7 +1025,7 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* * bd_mutex locking: @@ -1402,9 +1380,8 @@ static int blkdev_open(struct inode * inode, struct file * filp) return blkdev_get(bdev, filp->f_mode, filp); } -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) { - int ret = 0; struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; @@ -1424,7 +1401,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) } if (bdev->bd_contains == bdev) { if (disk->fops->release) - ret = disk->fops->release(disk, mode); + disk->fops->release(disk, mode); } if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; @@ -1443,10 +1420,9 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) bdput(bdev); if (victim) __blkdev_put(victim, mode, 1); - return ret; } -int blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, fmode_t mode) { mutex_lock(&bdev->bd_mutex); @@ -1490,15 +1466,15 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) mutex_unlock(&bdev->bd_mutex); - return __blkdev_put(bdev, mode, 0); + __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - - return blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode); + return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) @@ -1559,7 +1535,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < INT_MAX) + if (size < iocb->ki_left) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } @@ -1586,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = { .writepages = generic_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .is_dirty_writeback = buffer_check_dirty_writeback, }; const struct file_operations def_blk_fops = { diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 9a8622a5b867..2b3b83296977 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,5 +1,5 @@ config BTRFS_FS - tristate "Btrfs filesystem Unstable disk format" + tristate "Btrfs filesystem support" select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE @@ -52,3 +52,23 @@ config BTRFS_FS_CHECK_INTEGRITY In most cases, unless you are a btrfs developer who needs to verify the integrity of (super)-block write requests during the run of a regression test, say N + +config BTRFS_FS_RUN_SANITY_TESTS + bool "Btrfs will run sanity tests upon loading" + depends on BTRFS_FS + help + This will run some basic sanity tests on the free space cache + code to make sure it is acting as it should. These are mostly + regression tests and are only really interesting to btrfs devlopers. + + If unsure, say N. + +config BTRFS_DEBUG + bool "Btrfs debugging support" + depends on BTRFS_FS + help + Enable run-time debugging support for the btrfs filesystem. This may + enable additional and expensive checks with negative impact on + performance, or export extra information via sysfs. + + If unsure, say N. diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bd605c87adfd..eaf133384a8f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * to a logical address */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - int search_commit_root, - u64 time_seq, - struct __prelim_ref *ref, - struct ulist *parents, - const u64 *extent_item_pos) + struct btrfs_path *path, u64 time_seq, + struct __prelim_ref *ref, + struct ulist *parents, + const u64 *extent_item_pos) { - struct btrfs_path *path; struct btrfs_root *root; struct btrfs_key root_key; struct extent_buffer *eb; @@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int root_level; int level = ref->level; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->search_commit_root = !!search_commit_root; - root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; @@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, time_seq, ref->wanted_disk_byte, extent_item_pos); out: - btrfs_free_path(path); + path->lowest_level = 0; + btrfs_release_path(path); return ret; } @@ -322,7 +316,7 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, u64 time_seq, + struct btrfs_path *path, u64 time_seq, struct list_head *head, const u64 *extent_item_pos) { @@ -349,9 +343,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; - err = __resolve_indirect_ref(fs_info, search_commit_root, - time_seq, ref, parents, - extent_item_pos); + err = __resolve_indirect_ref(fs_info, path, time_seq, ref, + parents, extent_item_pos); + if (err == -ENOMEM) + goto out; if (err) continue; @@ -367,7 +362,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); if (!new_ref) { ret = -ENOMEM; - break; + goto out; } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; @@ -377,7 +372,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } ulist_reinit(parents); } - +out: ulist_free(parents); return ret; } @@ -421,7 +416,10 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, fs_info->tree_root->leafsize, 0); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); @@ -443,7 +441,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, * having a parent). * mode = 2: merge identical parents */ -static int __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, int mode) { struct list_head *pos1; @@ -489,7 +487,6 @@ static int __merge_refs(struct list_head *head, int mode) } } - return 0; } /* @@ -582,7 +579,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, default: WARN_ON(1); } - BUG_ON(ret); + if (ret) + return ret; } return 0; @@ -599,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, int slot; struct extent_buffer *leaf; struct btrfs_key key; + struct btrfs_key found_key; unsigned long ptr; unsigned long end; struct btrfs_extent_item *ei; @@ -616,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); + } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { + *info_level = found_key.offset; } else { BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } @@ -680,7 +683,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, default: WARN_ON(1); } - BUG_ON(ret); + if (ret) + return ret; ptr += btrfs_extent_inline_ref_size(type); } @@ -762,7 +766,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, default: WARN_ON(1); } - BUG_ON(ret); + if (ret) + return ret; + } return ret; @@ -787,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; - int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; @@ -796,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&prefs_delayed); key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->search_commit_root = !!search_commit_root; + if (!trans) + path->search_commit_root = 1; /* * grab both a lock on the path and a lock on the delayed ref head. @@ -817,7 +826,7 @@ again: goto out; BUG_ON(ret == 0); - if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { + if (trans) { /* * look if there are updates for this ref queued and lock the * head @@ -861,7 +870,8 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY) { + (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, &info_level, &prefs); if (ret) @@ -880,18 +890,14 @@ again: if (ret) goto out; - ret = __merge_refs(&prefs, 1); - if (ret) - goto out; + __merge_refs(&prefs, 1); - ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, - &prefs, extent_item_pos); + ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, + extent_item_pos); if (ret) goto out; - ret = __merge_refs(&prefs, 2); - if (ret) - goto out; + __merge_refs(&prefs, 2); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); @@ -900,7 +906,8 @@ again: if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); - BUG_ON(ret < 0); + if (ret < 0) + goto out; } if (ref->count && ref->parent) { struct extent_inode_elem *eie = NULL; @@ -911,7 +918,11 @@ again: info_level); eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); ref->inode_list = eie; @@ -920,6 +931,8 @@ again: ret = ulist_add_merge(refs, ref->parent, (uintptr_t)ref->inode_list, (u64 *)&eie, GFP_NOFS); + if (ret < 0) + goto out; if (!ret && extent_item_pos) { /* * we've recorded that parent, so we must extend @@ -930,7 +943,6 @@ again: eie = eie->next; eie->next = ref->inode_list; } - BUG_ON(ret < 0); } kfree(ref); } @@ -1180,6 +1192,20 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, return ret; } +/* + * this iterates to turn a name (from iref/extref) into a full filesystem path. + * Elements of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, @@ -1249,32 +1275,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } /* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! - */ -char *btrfs_iref_to_path(struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_inode_ref *iref, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) -{ - return btrfs_ref_to_path(fs_root, path, - btrfs_inode_ref_name_len(eb_in, iref), - (unsigned long)(iref + 1), - eb_in, parent, dest, size); -} - -/* * this makes the path point to (logical EXTENT_ITEM *) * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for * tree blocks and <0 on error. @@ -1285,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, { int ret; u64 flags; + u64 size = 0; u32 item_size; struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; - key.type = BTRFS_EXTENT_ITEM_KEY; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; key.offset = (u64)-1; @@ -1303,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, return ret; btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); - if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + if (found_key->type == BTRFS_METADATA_ITEM_KEY) + size = fs_info->extent_root->leafsize; + else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) + size = found_key->offset; + + if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && + found_key->type != BTRFS_METADATA_ITEM_KEY) || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) { + found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", (unsigned long long)logical); return -ENOENT; @@ -1461,9 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - struct list_head data_refs = LIST_HEAD_INIT(data_refs); - struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct ulist *refs = NULL; struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; @@ -1475,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); - if (search_commit_root) { - trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; - } else { + if (!search_commit_root) { trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -1508,11 +1514,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, iterate, ctx); } ulist_free(roots); - roots = NULL; } free_leaf_list(refs); - ulist_free(roots); out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 310a7f6d09b1..8f2e76702932 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -23,8 +23,6 @@ #include "ulist.h" #include "extent_io.h" -#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) - struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -59,9 +57,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots); -char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, struct extent_buffer *eb, - u64 parent, char *dest, u32 size); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9b97d4960e6..08b286b2a2c5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,7 +93,7 @@ struct btrfs_inode { unsigned long runtime_flags; - /* Keep track of who's O_SYNC/fsycing currently */ + /* Keep track of who's O_SYNC/fsyncing currently */ atomic_t sync_writers; /* full 64 bit generation number, struct vfs_inode doesn't have a big diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 18af6f48781a..1431a6965017 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; DECLARE_COMPLETION_ONSTACK(complete); - bio = bio_alloc(GFP_NOFS, num_pages - i); + bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { printk(KERN_INFO "btrfsic: bio_alloc() for %u pages failed!\n", diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 15b94089abc4..b189bd1e7a3e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -82,6 +82,10 @@ struct compressed_bio { u32 sums; }; +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen); + static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { @@ -106,7 +110,6 @@ static int check_compressed_csum(struct inode *inode, u64 disk_start) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; struct page *page; unsigned long i; char *kaddr; @@ -121,7 +124,7 @@ static int check_compressed_csum(struct inode *inode, csum = ~(u32)0; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE); btrfs_csum_final(csum, (char *)&csum); kunmap_atomic(kaddr); @@ -739,7 +742,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; -struct btrfs_compress_op *btrfs_compress_op[] = { +static struct btrfs_compress_op *btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, }; @@ -910,8 +913,9 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. */ -int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, int vcnt, size_t srclen) +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen) { struct list_head *workspace; int ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9afb0a62ae82..0c803b4fbf93 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,8 +30,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out, unsigned long max_out); -int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, int vcnt, size_t srclen); int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ca9d8f1a3bb6..5bf4c39e2ad6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,16 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid, - u64 time_seq); -struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 time_seq); +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -208,7 +203,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * tree until you end up with a lock on the root. A locked buffer * is returned, with a reference held. */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -361,6 +356,44 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) } /* + * Increment the upper half of tree_mod_seq, set lower half zero. + * + * Must be called with fs_info->tree_mod_seq_lock held. + */ +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) +{ + u64 seq = atomic64_read(&fs_info->tree_mod_seq); + seq &= 0xffffffff00000000ull; + seq += 1ull << 32; + atomic64_set(&fs_info->tree_mod_seq, seq); + return seq; +} + +/* + * Increment the lower half of tree_mod_seq. + * + * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers + * are generated should not technically require a spin lock here. (Rationale: + * incrementing the minor while incrementing the major seq number is between its + * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it + * just returns a unique sequence number as usual.) We have decided to leave + * that requirement in here and rethink it once we notice it really imposes a + * problem on some workload. + */ +static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * return the last minor in the previous major tree_mod_seq number + */ +u64 btrfs_tree_mod_seq_prev(u64 seq) +{ + return (seq & 0xffffffff00000000ull) - 1ull; +} + +/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -376,10 +409,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq(fs_info); + elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq(fs_info); + seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); @@ -524,7 +557,10 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, if (!tm) return -ENOMEM; - tm->seq = btrfs_inc_tree_mod_seq(fs_info); + spin_lock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); + spin_unlock(&fs_info->tree_mod_seq_lock); + return tm->seq; } @@ -643,7 +679,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags) + struct extent_buffer *new_root, gfp_t flags, + int log_removal) { struct tree_mod_elem *tm; int ret; @@ -651,7 +688,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal) + __tree_mod_log_free_eb(fs_info, old_root); ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) @@ -738,7 +776,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items, int log_removal) + unsigned long src_offset, int nr_items) { int ret; int i; @@ -752,12 +790,10 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - if (log_removal) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - } + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -802,11 +838,12 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) static noinline void tree_mod_log_set_root_pointer(struct btrfs_root *root, - struct extent_buffer *new_root_node) + struct extent_buffer *new_root_node, + int log_removal) { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS); + new_root_node, GFP_NOFS, log_removal); BUG_ON(ret < 0); } @@ -867,7 +904,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, - buf->len, &refs, &flags); + btrfs_header_level(buf), 1, + &refs, &flags); if (ret) return ret; if (refs == 0) { @@ -913,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { + int level = btrfs_header_level(buf); + ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, - new_flags, 0); + new_flags, level, 0); if (ret) return ret; } @@ -1028,7 +1068,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; extent_buffer_get(cow); - tree_mod_log_set_root_pointer(root, cow); + tree_mod_log_set_root_pointer(root, cow, 1); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, @@ -1049,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1067,11 +1108,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, */ static struct tree_mod_elem * __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, - struct btrfs_root *root, u64 time_seq) + struct extent_buffer *eb_root, u64 time_seq) { struct tree_mod_elem *tm; struct tree_mod_elem *found = NULL; - u64 root_logical = root->node->start; + u64 root_logical = eb_root->start; int looped = 0; if (!time_seq) @@ -1105,7 +1146,6 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, found = tm; root_logical = tm->old_root.logical; - BUG_ON(root_logical == root->node->start); looped = 1; } @@ -1122,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, * time_seq). */ static void -__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, - struct tree_mod_elem *first_tm) +__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq, struct tree_mod_elem *first_tm) { u32 n; struct rb_node *next; @@ -1133,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); + tree_mod_log_read_lock(fs_info); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1187,9 +1228,17 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, if (tm->index != first_tm->index) break; } + tree_mod_log_read_unlock(fs_info); btrfs_set_header_nritems(eb, n); } +/* + * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ static struct extent_buffer * tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, u64 time_seq) @@ -1223,9 +1272,12 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } extent_buffer_get(eb_rewin); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - __tree_mod_log_rewind(eb_rewin, time_seq, tm); + extent_buffer_get(eb_rewin); + btrfs_tree_read_lock(eb_rewin); + __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); @@ -1243,33 +1295,35 @@ static inline struct extent_buffer * get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; - struct extent_buffer *eb; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; u32 blocksize; - eb = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + eb_root = btrfs_read_lock_root_node(root); + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); if (!tm) - return root->node; + return eb_root; if (tm->op == MOD_LOG_ROOT_REPLACE) { old_root = &tm->old_root; old_generation = tm->generation; logical = old_root->logical; } else { - logical = root->node->start; + logical = eb_root->start; } tm = tree_mod_log_search(root->fs_info, logical, time_seq); if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); blocksize = btrfs_level_size(root, old_root->level); old = read_tree_block(root, logical, blocksize, 0); - if (!old) { + if (!old || !extent_buffer_uptodate(old)) { + free_extent_buffer(old); pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", logical); WARN_ON(1); @@ -1278,13 +1332,13 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(old); } } else if (old_root) { - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(logical, root->nodesize); } else { - eb = btrfs_clone_extent_buffer(root->node); - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); } if (!eb) @@ -1294,12 +1348,12 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, root->root_key.objectid); + btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } if (tm) - __tree_mod_log_rewind(eb, time_seq, tm); + __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); @@ -1311,15 +1365,15 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; int level; + struct extent_buffer *eb_root = btrfs_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { level = tm->old_root.level; } else { - rcu_read_lock(); - level = btrfs_header_level(root->node); - rcu_read_unlock(); + level = btrfs_header_level(eb_root); } + free_extent_buffer(eb_root); return level; } @@ -1514,8 +1568,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); - if (!cur) + if (!cur || !extent_buffer_uptodate(cur)) { + free_extent_buffer(cur); return -EIO; + } } else if (!uptodate) { err = btrfs_read_buffer(cur, gen); if (err) { @@ -1680,6 +1736,8 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct extent_buffer *eb; + if (slot < 0) return NULL; if (slot >= btrfs_header_nritems(parent)) @@ -1687,9 +1745,15 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); - return read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), - btrfs_node_ptr_generation(parent, slot)); + eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); + if (eb && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = NULL; + } + + return eb; } /* @@ -1754,7 +1818,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_set_root_pointer(root, child); + tree_mod_log_set_root_pointer(root, child, 1); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1818,7 +1882,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1); + del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1862,7 +1926,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot); + del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -2117,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root, } } -/* - * returns -EAGAIN if it had to drop the path, or zero if everything was in - * cache - */ -static noinline int reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) { int slot; int nritems; @@ -2131,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int ret = 0; int blocksize; parent = path->nodes[level + 1]; if (!parent) - return 0; + return; nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; @@ -2163,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = 0; free_extent_buffer(eb); } - if (block1 || block2) { - ret = -EAGAIN; - - /* release the whole path */ - btrfs_release_path(path); - - /* read the blocks */ - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); - if (block1) { - eb = read_tree_block(root, block1, blocksize, 0); - free_extent_buffer(eb); - } - if (block2) { - eb = read_tree_block(root, block2, blocksize, 0); - free_extent_buffer(eb); - } - } - return ret; + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); } @@ -2210,9 +2252,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level, int no_skips = 0; struct extent_buffer *t; - if (path->really_keep_locks) - return; - for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; @@ -2260,7 +2299,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks || path->really_keep_locks) + if (path->keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2301,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - /* - * we found an up to date block without - * sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; - } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - free_extent_buffer(tmp); - btrfs_set_path_blocking(p); + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + *eb_ret = tmp; + return 0; + } - /* now we're allowed to do a blocking uptodate check */ - tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { - *eb_ret = tmp; - return 0; - } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + btrfs_set_path_blocking(p); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_buffer(tmp, gen); + if (!ret) { + *eb_ret = tmp; + return 0; } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } /* @@ -2390,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = split_node(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2414,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = balance_level(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2493,7 +2519,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (!cow) write_lock_level = -1; - if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) + if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; min_write_lock_level = write_lock_level; @@ -2795,15 +2821,9 @@ again: btrfs_clear_path_blocking(p, b, BTRFS_READ_LOCK); } + b = tree_mod_log_rewind(root->fs_info, b, time_seq); p->locks[level] = BTRFS_READ_LOCK; p->nodes[level] = b; - b = tree_mod_log_rewind(root->fs_info, b, time_seq); - if (b != p->nodes[level]) { - btrfs_tree_unlock_rw(p->nodes[level], - p->locks[level]); - p->locks[level] = 0; - p->nodes[level] = b; - } } else { p->slots[level] = slot; unlock_up(p, level, lowest_unlock, 0, NULL); @@ -2902,8 +2922,7 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -2928,8 +2947,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -2951,7 +2969,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } /* @@ -2998,7 +3016,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items, 1); + push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3069,7 +3087,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items, 1); + src_nritems - push_items, push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3144,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c); + tree_mod_log_set_root_pointer(root, c, 0); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -3221,18 +3239,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; - int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { - /* trying to split the root, lets make a new one */ - ret = insert_new_root(trans, root, path, level + 1); /* - * removal of root nodes has been logged by - * tree_mod_log_set_root_pointer due to locking + * trying to split the root, lets make a new one + * + * tree mod log: We don't log_removal old root in + * insert_new_root, because that root buffer will be kept as a + * normal node. We are going to log removal of half of the + * elements below with tree_mod_log_eb_copy. We're holding a + * tree lock on the buffer, which is why we cannot race with + * other tree_mod_log users. */ - tree_mod_log_removal = 0; + ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; } else { @@ -3270,8 +3291,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, - tree_mod_log_removal); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -3687,7 +3707,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -3934,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size) { + if (data_size && path->nodes[1]) { wret = push_leaf_right(trans, root, path, data_size, data_size, 0, 0); if (wret < 0) @@ -4047,8 +4067,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(trans, root, path, - &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4264,7 +4283,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(trans, root, path, new_key, &item_size, + setup_items_for_insert(root, path, new_key, &item_size, item_size, item_size + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; @@ -4281,9 +4300,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, u32 new_size, int from_end) { int slot; @@ -4367,7 +4384,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } item = btrfs_item_nr(leaf, slot); @@ -4381,10 +4398,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, } /* - * make the item pointed to by the path bigger, data_size is the new size. + * make the item pointed to by the path bigger, data_size is the added size. */ -void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size) { int slot; @@ -4454,8 +4470,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -void setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr) { @@ -4531,7 +4546,7 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans, if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -4571,7 +4586,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(root, path, cpu_key, data_size, total_data, total_size, nr); return 0; } @@ -4609,8 +4624,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4642,7 +4657,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, level + 1); + fixup_low_keys(root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4663,7 +4678,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1]); + del_ptr(root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -4744,7 +4759,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -5464,139 +5479,6 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) return btrfs_next_old_leaf(root, path, 0); } -/* Release the path up to but not including the given level */ -static void btrfs_release_level(struct btrfs_path *path, int level) -{ - int i; - - for (i = 0; i < level; i++) { - path->slots[i] = 0; - if (!path->nodes[i]) - continue; - if (path->locks[i]) { - btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); - path->locks[i] = 0; - } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } -} - -/* - * This function assumes 2 things - * - * 1) You are using path->keep_locks - * 2) You are not inserting items. - * - * If either of these are not true do not use this function. If you need a next - * leaf with either of these not being true then this function can be easily - * adapted to do that, but at the moment these are the limitations. - */ -int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - int del) -{ - struct extent_buffer *b; - struct btrfs_key key; - u32 nritems; - int level = 1; - int slot; - int ret = 1; - int write_lock_level = BTRFS_MAX_LEVEL; - int ins_len = del ? -1 : 0; - - WARN_ON(!(path->keep_locks || path->really_keep_locks)); - - nritems = btrfs_header_nritems(path->nodes[0]); - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); - - while (path->nodes[level]) { - nritems = btrfs_header_nritems(path->nodes[level]); - if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { -search: - btrfs_release_path(path); - ret = btrfs_search_slot(trans, root, &key, path, - ins_len, 1); - if (ret < 0) - goto out; - level = 1; - continue; - } - - if (path->slots[level] >= nritems - 1) { - level++; - continue; - } - - btrfs_release_level(path, level); - break; - } - - if (!path->nodes[level]) { - ret = 1; - goto out; - } - - path->slots[level]++; - b = path->nodes[level]; - - while (b) { - level = btrfs_header_level(b); - - if (!should_cow_block(trans, root, b)) - goto cow_done; - - btrfs_set_path_blocking(path); - ret = btrfs_cow_block(trans, root, b, - path->nodes[level + 1], - path->slots[level + 1], &b); - if (ret) - goto out; -cow_done: - path->nodes[level] = b; - btrfs_clear_path_blocking(path, NULL, 0); - if (level != 0) { - ret = setup_nodes_for_search(trans, root, path, b, - level, ins_len, - &write_lock_level); - if (ret == -EAGAIN) - goto search; - if (ret) - goto out; - - b = path->nodes[level]; - slot = path->slots[level]; - - ret = read_block_for_search(trans, root, path, - &b, level, slot, &key, 0); - if (ret == -EAGAIN) - goto search; - if (ret) - goto out; - level = btrfs_header_level(b); - if (!btrfs_try_tree_write_lock(b)) { - btrfs_set_path_blocking(path); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(path, b, - BTRFS_WRITE_LOCK); - } - path->locks[level] = BTRFS_WRITE_LOCK; - path->nodes[level] = b; - path->slots[level] = 0; - } else { - path->slots[level] = 0; - ret = 0; - break; - } - } - -out: - if (ret) - btrfs_release_path(path); - - return ret; -} - int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d82922179db..e795bf135e80 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -88,12 +88,12 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -340,6 +340,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) */ #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 +#define BTRFS_FS_STATE_TRANS_ABORTED 2 /* Super block flags */ /* Errors detected */ @@ -508,6 +509,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) +#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -518,7 +520,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) /* * A leaf is full of items. offset and size tell us where to find @@ -583,7 +586,6 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; - unsigned int really_keep_locks:1; }; /* @@ -959,8 +961,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE enum btrfs_raid_types { @@ -1019,9 +1021,9 @@ struct btrfs_block_group_item { */ #define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) /* - * SCANNING is set during the initialization phase + * RESCAN is set during the initialization phase */ -#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) /* * Some qgroup entries are known to be out of date, * either because the configuration has changed in a way that @@ -1050,7 +1052,7 @@ struct btrfs_qgroup_status_item { * only used during scanning to record the progress * of the scan. It contains a logical address */ - __le64 scan; + __le64 rescan; } __attribute__ ((__packed__)); struct btrfs_qgroup_info_item { @@ -1100,6 +1102,18 @@ struct btrfs_space_info { account */ /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed everytime we call + * btrfs_free_extent so it is a realtime count of what will be freed + * once the transaction is committed. It will be zero'ed everytime the + * transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + /* * we bump reservation progress every time we decrement * bytes_reserved. This way people waiting for reservations * know something good has happened and they can check @@ -1360,6 +1374,17 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. + */ + spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct block_device *__bdev; @@ -1409,7 +1434,7 @@ struct btrfs_fs_info { /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; - atomic_t tree_mod_seq; + atomic64_t tree_mod_seq; struct list_head tree_mod_seq_list; struct seq_list tree_mod_seq_elem; @@ -1424,25 +1449,22 @@ struct btrfs_fs_info { atomic_t open_ioctl_trans; /* - * this is used by the balancing code to wait for all the pending - * ordered extents + * this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* - * all of the data=ordered extents pending writeback + * all fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; - spinlock_t delalloc_lock; - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ - struct list_head delalloc_inodes; + spinlock_t delalloc_root_lock; + /* all fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes @@ -1485,8 +1507,6 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; - int enospc_unlink; - int trans_no_join; u64 total_pinned; @@ -1581,12 +1601,28 @@ struct btrfs_fs_info { struct rb_root qgroup_tree; spinlock_t qgroup_lock; + /* + * used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* protect user change for quota operations */ + struct mutex qgroup_ioctl_lock; + /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ u64 qgroup_seq; + /* qgroup rescan items */ + struct mutex qgroup_rescan_lock; /* protects the progress item */ + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workers qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* filesystem state */ unsigned long fs_state; @@ -1718,6 +1754,31 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; + + spinlock_t delalloc_lock; + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; }; struct btrfs_ioctl_defrag_range_args { @@ -1808,6 +1869,12 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_EXTENT_ITEM_KEY 168 +/* + * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know + * the length, so we save the level in key->offset instead of the length. + */ +#define BTRFS_METADATA_ITEM_KEY 169 + #define BTRFS_TREE_BLOCK_REF_KEY 176 #define BTRFS_EXTENT_DATA_REF_KEY 178 @@ -2766,8 +2833,10 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { - int t = btrfs_super_csum_type(s); - BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ return btrfs_csum_sizes[t]; } @@ -2864,8 +2933,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, version, 64); BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, - scan, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, @@ -2999,17 +3068,21 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, num_items; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct btrfs_root *root, + struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -3017,8 +3090,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, @@ -3028,10 +3099,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u64 parent, int last_ref); -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, @@ -3044,7 +3111,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data); + struct btrfs_key *ins, int is_data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3052,7 +3119,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data); + int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, @@ -3084,7 +3151,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -3133,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); @@ -3161,8 +3230,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); @@ -3198,12 +3266,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size); -void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3243,8 +3308,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -void setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root @@ -3264,9 +3328,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); -int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - int del); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -3281,7 +3342,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3298,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root->fs_info->sb->s_flags & MS_RDONLY || + btrfs_fs_closing(root->fs_info)); +} + static inline void free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info->balance_ctl); @@ -3318,10 +3390,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) -{ - return atomic_inc_return(&fs_info->tree_mod_seq); -} +u64 btrfs_tree_mod_seq_prev(u64 seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ @@ -3345,12 +3414,11 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -void btrfs_read_root_item(struct btrfs_root *root, - struct extent_buffer *eb, int slot, - struct btrfs_root_item *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct - btrfs_root_item *item, struct btrfs_key *key); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); @@ -3380,9 +3448,6 @@ struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -3460,16 +3525,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); -u64 btrfs_file_extent_length(struct btrfs_path *path); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow); int btrfs_csum_truncate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 isize); @@ -3492,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); +noinline int can_nocow_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes); /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ #if defined(ClearPageFsMisc) && !defined(ClearPageChecked) @@ -3529,10 +3593,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, u64 new_dirid); int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, @@ -3542,7 +3606,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); @@ -3560,7 +3623,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3611,7 +3673,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); -void btrfs_drop_pages(struct page **pages, size_t num_pages); int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3634,14 +3695,31 @@ int btrfs_sync_fs(struct super_block *sb, int wait); #ifdef CONFIG_PRINTK __printf(2, 3) -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else static inline __printf(2, 3) -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } #endif +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) + __printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -3663,11 +3741,28 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, disk_super = fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + printk(KERN_INFO "btrfs: setting %llu feature flag\n", + flag); + } + spin_unlock(&fs_info->super_lock); } } +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. @@ -3753,7 +3848,6 @@ void btrfs_scrub_continue_super(struct btrfs_root *root); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); @@ -3784,7 +3878,9 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 14fce27b4780..375510913fe7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -202,7 +202,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_unlock(&root->lock); } -struct btrfs_delayed_node *btrfs_first_delayed_node( +static struct btrfs_delayed_node *btrfs_first_delayed_node( struct btrfs_delayed_root *delayed_root) { struct list_head *p; @@ -221,7 +221,7 @@ out: return node; } -struct btrfs_delayed_node *btrfs_next_delayed_node( +static struct btrfs_delayed_node *btrfs_next_delayed_node( struct btrfs_delayed_node *node) { struct btrfs_delayed_root *delayed_root; @@ -282,7 +282,7 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) __btrfs_release_delayed_node(node, 0); } -struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( +static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( struct btrfs_delayed_root *delayed_root) { struct list_head *p; @@ -308,7 +308,7 @@ static inline void btrfs_release_prepared_delayed_node( __btrfs_release_delayed_node(node, 1); } -struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) { struct btrfs_delayed_item *item; item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); @@ -383,7 +383,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( return NULL; } -struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( +static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( struct btrfs_delayed_node *delayed_node, struct btrfs_key *key) { @@ -394,45 +394,6 @@ struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( return item; } -struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item; - - item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, - NULL, NULL); - return item; -} - -struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item, *next; - - item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, - NULL, &next); - if (!item) - item = next; - - return item; -} - -struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item, *next; - - item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, - NULL, &next); - if (!item) - item = next; - - return item; -} - static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins, int action) @@ -535,7 +496,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) } } -struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( +static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct btrfs_delayed_node *delayed_node) { struct rb_node *p; @@ -548,7 +509,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( return item; } -struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( +static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct btrfs_delayed_node *delayed_node) { struct rb_node *p; @@ -561,7 +522,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( return item; } -struct btrfs_delayed_item *__btrfs_next_delayed_item( +static struct btrfs_delayed_item *__btrfs_next_delayed_item( struct btrfs_delayed_item *item) { struct rb_node *p; @@ -574,20 +535,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, - u64 root_id) -{ - struct btrfs_key root_key; - - if (root->objectid == root_id) - return root; - - root_key.objectid = root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(root->fs_info, &root_key); -} - static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_item *item) @@ -766,10 +713,9 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, * This helper will insert some continuous items into the same leaf according * to the free space of the leaf. */ -static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *item) +static int btrfs_batch_insert_items(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) { struct btrfs_delayed_item *curr, *next; int free_space; @@ -848,7 +794,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ - setup_items_for_insert(trans, root, path, keys, data_size, + setup_items_for_insert(root, path, keys, data_size, total_data_size, total_size, nitems); /* insert the dir index items */ @@ -932,7 +878,7 @@ do_again: if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { /* insert the continuous items into the same leaf */ path->slots[0]++; - btrfs_batch_insert_items(trans, root, path, curr); + btrfs_batch_insert_items(root, path, curr); } btrfs_release_delayed_item(prev); btrfs_mark_buffer_dirty(path->nodes[0]); @@ -1721,8 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree * */ -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list) { struct btrfs_dir_item *di; @@ -1744,13 +1689,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (curr->key.offset < filp->f_pos) { + if (curr->key.offset < ctx->pos) { if (atomic_dec_and_test(&curr->refs)) kfree(curr); continue; } - filp->f_pos = curr->key.offset; + ctx->pos = curr->key.offset; di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); @@ -1759,7 +1704,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, d_type = btrfs_filetype_table[di->type]; btrfs_disk_key_to_cpu(&location, &di->location); - over = filldir(dirent, name, name_len, curr->key.offset, + over = !dir_emit(ctx, name, name_len, location.objectid, d_type); if (atomic_dec_and_test(&curr->refs)) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 1d5c5f7abe3e..a4b38f934d14 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list); /* for init */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index b7a0641ead77..c219463fb1fd 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -40,16 +40,19 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * compare two delayed tree backrefs with same bytenr and type */ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1) + struct btrfs_delayed_tree_ref *ref1, int type) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } return 0; } @@ -113,7 +116,8 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1)); + btrfs_delayed_node_to_tree_ref(ref1), + ref1->type); } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || ref1->type == BTRFS_SHARED_DATA_REF_KEY) { return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), @@ -357,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is " - "%llu (%p)\n", seq, elem->seq, delayed_refs); + pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n", + (u32)(seq >> 32), (u32)seq, + (u32)(elem->seq >> 32), (u32)elem->seq, + delayed_refs); ret = 1; } } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f75fcaf79aeb..70b962cc177d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; u64 flags_to_set; + int level; unsigned int update_key:1; unsigned int update_flags:1; unsigned int is_data:1; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7ba7b3900cb8..4253ad580e39 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; + if (btrfs_fs_incompat(fs_info, RAID56)) { + pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + return -EINVAL; + } + switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: @@ -395,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -465,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 502c2158167c..79e594e341c7 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,6 +21,10 @@ #include "hash.h" #include "transaction.h" +static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); + /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -49,7 +53,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(trans, root, path, data_size); + btrfs_extend_item(root, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -379,7 +383,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len) { @@ -442,8 +446,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(root, path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d19a0a554aa..6b092a1c4e37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> +#include <linux/uuid.h> #include <asm/unaligned.h> #include "compat.h" #include "ctree.h" @@ -69,6 +70,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, int mark); static int btrfs_destroy_pinned_extent(struct btrfs_root *root, struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); +static void btrfs_error_commit_super(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -149,7 +152,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, @@ -222,7 +225,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); if (ret == -EEXIST) { free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); @@ -238,7 +241,7 @@ out: return em; } -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +u32 btrfs_csum_data(char *data, u32 seed, size_t len) { return crc32c(seed, data, len); } @@ -274,7 +277,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, if (err) return 1; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc = btrfs_csum_data(kaddr + offset - map_start, crc, cur_len); len -= cur_len; offset += cur_len; @@ -354,6 +357,49 @@ out: } /* + * Return 0 if the superblock checksum type matches the checksum value of that + * algorithm. Pass the raw disk superblock data. + */ +static int btrfs_check_super_csum(char *raw_disk_sb) +{ + struct btrfs_super_block *disk_sb = + (struct btrfs_super_block *)raw_disk_sb; + u16 csum_type = btrfs_super_csum_type(disk_sb); + int ret = 0; + + if (csum_type == BTRFS_CSUM_TYPE_CRC32) { + u32 crc = ~(u32)0; + const int csum_size = sizeof(crc); + char result[csum_size]; + + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space + * is filled with zeros and is included in the checkum. + */ + crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, + crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, result); + + if (memcmp(raw_disk_sb, result, csum_size)) + ret = 1; + + if (ret && btrfs_super_generation(disk_sb) < 10) { + printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); + ret = 0; + } + } + + if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", + csum_type); + ret = 1; + } + + return ret; +} + +/* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. */ @@ -530,41 +576,6 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, - struct page *page, int max_walk) -{ - struct extent_buffer *eb; - u64 start = page_offset(page); - u64 target = start; - u64 min_start; - - if (start < max_walk) - min_start = 0; - else - min_start = start - max_walk; - - while (start >= min_start) { - eb = find_extent_buffer(tree, start, 0); - if (eb) { - /* - * we found an extent buffer and it contains our page - * horray! - */ - if (eb->start <= target && - eb->start + eb->len > target) - return eb; - - /* we found an extent buffer that wasn't for us */ - free_extent_buffer(eb); - return NULL; - } - if (start == 0) - break; - start -= PAGE_CACHE_SIZE; - } - return NULL; -} - static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror) { @@ -613,6 +624,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } found_level = btrfs_header_level(eb); + if (found_level >= BTRFS_MAX_LEVEL) { + btrfs_info(root->fs_info, "bad tree block level %d\n", + (int)btrfs_header_level(eb)); + ret = -EIO; + goto err; + } btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); @@ -636,10 +653,9 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (!ret) set_extent_buffer_uptodate(eb); err: - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + if (reads_done && + test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, ret); - } if (ret) { /* @@ -993,17 +1009,12 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - /* - * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing - * slab allocation from alloc_extent_state down the callchain where - * it'd hit a BUG_ON as those flags are not allowed. - */ - gfp_flags &= ~GFP_SLAB_BUG_MASK; - return try_release_extent_buffer(page, gfp_flags); + return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned long offset) +static void btree_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -1181,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; + root->nr_delalloc_inodes = 0; + root->nr_ordered_extents = 0; root->name = NULL; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); @@ -1189,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->delalloc_inodes); + INIT_LIST_HEAD(&root->delalloc_root); + INIT_LIST_HEAD(&root->ordered_extents); + INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->delalloc_lock); + spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); @@ -1206,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); + atomic_set(&root->refs, 1); root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, @@ -1224,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->root_item_lock); } -static int __must_check find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) -{ - int ret; - u32 blocksize; - u64 generation; - - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); - ret = btrfs_find_last_root(tree_root, objectid, - &root->root_item, &root->root_key); - if (ret > 0) - return -ENOENT; - else if (ret < 0) - return ret; - - generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); - root->commit_root = NULL; - root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); - if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) { - free_extent_buffer(root->node); - root->node = NULL; - return -EIO; - } - root->commit_root = btrfs_root_node(root); - return 0; -} - static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); @@ -1275,6 +1262,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret = 0; u64 bytenr; + uuid_le uuid; root = btrfs_alloc_root(fs_info); if (!root) @@ -1324,6 +1312,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); + uuid_le_gen(&uuid); + memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; key.objectid = objectid; @@ -1438,63 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; - struct extent_buffer *l; u64 generation; u32 blocksize; - int ret = 0; - int slot; + int ret; - root = btrfs_alloc_root(fs_info); - if (!root) + path = btrfs_alloc_path(); + if (!path) return ERR_PTR(-ENOMEM); - if (location->offset == (u64)-1) { - ret = find_and_setup_root(tree_root, fs_info, - location->objectid, root); - if (ret) { - kfree(root); - return ERR_PTR(ret); - } - goto out; + + root = btrfs_alloc_root(fs_info); + if (!root) { + ret = -ENOMEM; + goto alloc_fail; } __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, - root, fs_info, location->objectid); + root, fs_info, key->objectid); - path = btrfs_alloc_path(); - if (!path) { - kfree(root); - return ERR_PTR(-ENOMEM); - } - ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret == 0) { - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_read_root_item(tree_root, l, slot, &root->root_item); - memcpy(&root->root_key, location, sizeof(*location)); - } - btrfs_free_path(path); + ret = btrfs_find_root(tree_root, key, path, + &root->root_item, &root->root_key); if (ret) { - kfree(root); if (ret > 0) ret = -ENOENT; - return ERR_PTR(ret); + goto find_fail; } generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + if (!root->node) { + ret = -ENOMEM; + goto find_fail; + } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + ret = -EIO; + goto read_fail; + } root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + btrfs_free_path(path); + return root; + +read_fail: + free_extent_buffer(root->node); +find_fail: + kfree(root); +alloc_fail: + root = ERR_PTR(ret); + goto out; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + + root = btrfs_read_tree_root(tree_root, location); + if (IS_ERR(root)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; btrfs_check_and_init_root_item(&root->root_item); } @@ -1502,6 +1502,66 @@ out: return root; } +int btrfs_init_fs_root(struct btrfs_root *root) +{ + int ret; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + btrfs_init_free_ino_ctl(root); + mutex_init(&root->fs_commit_mutex); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + return 0; +fail: + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + return ret; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) +{ + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); + spin_unlock(&fs_info->fs_roots_radix_lock); + return root; +} + +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + int ret; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + root->in_radix = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + + return ret; +} + struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location) { @@ -1522,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->quota_root ? fs_info->quota_root : ERR_PTR(-ENOENT); again: - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)location->objectid); - spin_unlock(&fs_info->fs_roots_radix_lock); + root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) return root; - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + root = btrfs_read_fs_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; goto fail; } - btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); - - ret = get_anon_bdev(&root->anon_dev); + ret = btrfs_init_fs_root(root); if (ret) goto fail; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = -ENOENT; - goto fail; - } - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); if (ret < 0) goto fail; if (ret == 0) root->orphan_item_inserted = 1; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) - goto fail; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); - if (ret == 0) - root->in_radix = 1; - - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { free_fs_root(root); @@ -1581,10 +1613,6 @@ again: } goto fail; } - - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - WARN_ON(ret); return root; fail: free_fs_root(root); @@ -1656,17 +1684,38 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && - mutex_trylock(&root->fs_info->cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); + again = 0; + + /* Make the cleaner go to sleep early. */ + if (btrfs_need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) + goto sleep; + + /* + * Avoid the problem that we change the status of the fs + * during the above check and trylock. + */ + if (btrfs_need_cleaner_sleep(root)) { mutex_unlock(&root->fs_info->cleaner_mutex); - btrfs_run_defrag_inodes(root->fs_info); + goto sleep; } - if (!try_to_freeze()) { + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount and umount, + * needn't do anything special here. + */ + btrfs_run_defrag_inodes(root->fs_info); +sleep: + if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1699,7 +1748,7 @@ static int transaction_kthread(void *arg) } now = get_seconds(); - if (!cur->blocked && + if (cur->state < TRANS_STATE_BLOCKED && (now < cur->start_time || now - cur->start_time < 30)) { spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; @@ -1935,35 +1984,60 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, return 0; } +/* helper to cleanup workers */ +static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) +{ + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->flush_workers); + btrfs_stop_workers(&fs_info->qgroup_rescan_workers); +} + /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { free_extent_buffer(info->tree_root->node); free_extent_buffer(info->tree_root->commit_root); - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - if (info->quota_root) { - free_extent_buffer(info->quota_root->node); - free_extent_buffer(info->quota_root->commit_root); - } - info->tree_root->node = NULL; info->tree_root->commit_root = NULL; - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; + + if (info->dev_root) { + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + } + if (info->extent_root) { + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + } + if (info->csum_root) { + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + } if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); info->quota_root->node = NULL; info->quota_root->commit_root = NULL; } - if (chunk_root) { free_extent_buffer(info->chunk_root->node); free_extent_buffer(info->chunk_root->commit_root); @@ -1972,6 +2046,36 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } } +static void del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + btrfs_put_fs_root(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); + } +} int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, @@ -2001,14 +2105,8 @@ int open_ctree(struct super_block *sb, int backup_index = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); - csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); - dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info); - - if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root || !quota_root) { + if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; } @@ -2051,15 +2149,16 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); - spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); + spin_lock_init(&fs_info->super_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); seqlock_init(&fs_info->profiles_lock); @@ -2083,12 +2182,11 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->tree_mod_seq, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->trans_no_join = 0; fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; @@ -2099,8 +2197,8 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(&fs_info->ordered_extents); - spin_lock_init(&fs_info->ordered_extent_lock); + INIT_LIST_HEAD(&fs_info->ordered_roots); + spin_lock_init(&fs_info->ordered_root_lock); fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_NOFS); if (!fs_info->delayed_root) { @@ -2187,11 +2285,14 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->dev_replace.lock); spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2211,12 +2312,31 @@ int open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); invalidate_bdev(fs_devices->latest_bdev); + + /* + * Read super block and check the signature bytes only + */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) { err = -EINVAL; goto fail_alloc; } + /* + * We want to check superblock checksum, the type is stored inside. + * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). + */ + if (btrfs_check_super_csum(bh->b_data)) { + printk(KERN_ERR "btrfs: superblock checksum mismatch\n"); + err = -EINVAL; + goto fail_alloc; + } + + /* + * super_copy is zeroed at allocation time and we never touch the + * following bytes up to INFO_SIZE, the checksum is calculated from + * the whole block of INFO_SIZE + */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); @@ -2224,6 +2344,13 @@ int open_ctree(struct super_block *sb, memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + err = -EINVAL; + goto fail_alloc; + } + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2232,13 +2359,6 @@ int open_ctree(struct super_block *sb, if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); - if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); - err = ret; - goto fail_alloc; - } - /* * run through our array of backup supers and setup * our ring pointer to the oldest one @@ -2290,6 +2410,9 @@ int open_ctree(struct super_block *sb, if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + printk(KERN_ERR "btrfs: has skinny extents\n"); + /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -2319,6 +2442,10 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + /* + * Needn't use the lock because there is no other task which will + * update the flag. + */ btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -2394,6 +2521,8 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->readahead_workers, "readahead", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -2428,6 +2557,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); ret |= btrfs_start_workers(&fs_info->flush_workers); + ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2475,8 +2605,8 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), blocksize, generation); - BUG_ON(!chunk_root->node); /* -ENOMEM */ - if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (!chunk_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; @@ -2526,33 +2656,44 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + extent_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(extent_root)) { + ret = PTR_ERR(extent_root); goto recovery_tree_root; + } extent_root->track_dirty = 1; + fs_info->extent_root = extent_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_DEV_TREE_OBJECTID, dev_root); - if (ret) + location.objectid = BTRFS_DEV_TREE_OBJECTID; + dev_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(dev_root)) { + ret = PTR_ERR(dev_root); goto recovery_tree_root; + } dev_root->track_dirty = 1; + fs_info->dev_root = dev_root; + btrfs_init_devices_late(fs_info); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_CSUM_TREE_OBJECTID, csum_root); - if (ret) + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + csum_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(csum_root)) { + ret = PTR_ERR(csum_root); goto recovery_tree_root; + } csum_root->track_dirty = 1; + fs_info->csum_root = csum_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_QUOTA_TREE_OBJECTID, quota_root); - if (ret) { - kfree(quota_root); - quota_root = fs_info->quota_root = NULL; - } else { + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + quota_root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(quota_root)) { quota_root->track_dirty = 1; fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; + fs_info->quota_root = quota_root; } fs_info->generation = generation; @@ -2661,6 +2802,13 @@ retry_root_backup: log_tree_root->node = read_tree_block(tree_root, bytenr, blocksize, generation + 1); + if (!log_tree_root->node || + !extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "btrfs: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_trans_kthread; + } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2698,11 +2846,9 @@ retry_root_backup: location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; + location.offset = 0; fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (!fs_info->fs_root) - goto fail_qgroup; if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); goto fail_qgroup; @@ -2734,12 +2880,16 @@ retry_root_backup: return ret; } + btrfs_qgroup_rescan_resume(fs_info); + return 0; fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); + btrfs_cleanup_transaction(fs_info->tree_root); + del_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -2750,6 +2900,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_block_groups: + btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); fail_tree_roots: @@ -2757,22 +2908,7 @@ fail_tree_roots: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->flush_workers); + btrfs_stop_all_workers(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2904,7 +3040,10 @@ static int write_dev_supers(struct btrfs_device *device, if (wait) { bh = __find_get_block(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); - BUG_ON(!bh); + if (!bh) { + errors++; + continue; + } wait_on_buffer(bh); if (!buffer_uptodate(bh)) errors++; @@ -2919,7 +3058,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); crc = ~(u32)0; - crc = btrfs_csum_data(NULL, (char *)sb + + crc = btrfs_csum_data((char *)sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -2931,6 +3070,13 @@ static int write_dev_supers(struct btrfs_device *device, */ bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + if (!bh) { + printk(KERN_ERR "btrfs: couldn't get super " + "buffer head for bytenr %Lu\n", bytenr); + errors++; + continue; + } + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); /* one reference for submit_bh */ @@ -3013,7 +3159,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) * caller */ device->flush_bio = NULL; - bio = bio_alloc(GFP_NOFS, 0); + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) return -ENOMEM; @@ -3141,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( BTRFS_BLOCK_GROUP_RAID10)) { num_tolerated_disk_barrier_failures = 1; } else if (flags & - BTRFS_BLOCK_GROUP_RAID5) { + BTRFS_BLOCK_GROUP_RAID6) { num_tolerated_disk_barrier_failures = 2; } } @@ -3153,7 +3299,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( return num_tolerated_disk_barrier_failures; } -int write_all_supers(struct btrfs_root *root, int max_mirrors) +static int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; struct btrfs_device *dev; @@ -3249,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +/* Drop a fs root from the radix tree and free it. */ +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -3280,38 +3428,12 @@ static void free_fs_root(struct btrfs_root *root) kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); kfree(root->name); - kfree(root); + btrfs_put_fs_root(root); } -static void del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_root(struct btrfs_root *root) { - int ret; - struct btrfs_root *gang[8]; - int i; - - while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); - - if (gang[0]->in_radix) { - btrfs_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); - kfree(gang[0]); - } - } - - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_free_fs_root(fs_info, gang[i]); - } + free_fs_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3349,8 +3471,8 @@ int btrfs_commit_super(struct btrfs_root *root) mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + wake_up_process(root->fs_info->cleaner_kthread); /* wait until ongoing cleanup work done */ down_write(&root->fs_info->cleanup_work_sem); @@ -3426,43 +3548,15 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_extent_buffer(fs_info->extent_root->node); - free_extent_buffer(fs_info->extent_root->commit_root); - free_extent_buffer(fs_info->tree_root->node); - free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(fs_info->chunk_root->node); - free_extent_buffer(fs_info->chunk_root->commit_root); - free_extent_buffer(fs_info->dev_root->node); - free_extent_buffer(fs_info->dev_root->commit_root); - free_extent_buffer(fs_info->csum_root->node); - free_extent_buffer(fs_info->csum_root->commit_root); - if (fs_info->quota_root) { - free_extent_buffer(fs_info->quota_root->node); - free_extent_buffer(fs_info->quota_root->commit_root); - } - btrfs_free_block_groups(fs_info); + btrfs_stop_all_workers(fs_info); + del_fs_roots(fs_info); - iput(fs_info->btree_inode); + free_root_pointers(fs_info, 1); - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->flush_workers); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) @@ -3567,18 +3661,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { - if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm\n"); - return -EINVAL; - } - - if (read_only) - return 0; - + /* + * Placeholder for checks + */ return 0; } -void btrfs_error_commit_super(struct btrfs_root *root) +static void btrfs_error_commit_super(struct btrfs_root *root) { mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); @@ -3600,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, INIT_LIST_HEAD(&splice); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { @@ -3608,11 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, ordered_operations); list_del_init(&btrfs_inode->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); } @@ -3620,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); /* * This will just short circuit the ordered completion stuff which will * make sure the ordered extent gets properly cleaned up. */ - list_for_each_entry(ordered, &root->fs_info->ordered_extents, + list_for_each_entry(ordered, &root->ordered_extents, root_extent_list) set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->ordered_extent_lock); +} + +static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + list_del_init(&root->ordered_root); + + btrfs_destroy_ordered_extents(root); + + cond_resched_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); } int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -3650,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->root)) != NULL) { struct btrfs_delayed_ref_head *head = NULL; + bool pin_bytes = false; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); @@ -3669,6 +3783,8 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } + if (head->must_insert_reserved) + pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) @@ -3679,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (head) - mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); + if (head) { + if (pin_bytes) + btrfs_pin_extent(root, ref->bytenr, + ref->num_bytes, 1); + mutex_unlock(&head->mutex); + } btrfs_put_delayed_ref(ref); cond_resched(); @@ -3718,21 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + btrfs_inode = list_first_entry(&splice, struct btrfs_inode, + delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &btrfs_inode->runtime_flags); + spin_unlock(&root->delalloc_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + list_del_init(&root->delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + spin_unlock(&fs_info->delalloc_root_lock); + + btrfs_destroy_delalloc_inodes(root); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); } static int btrfs_destroy_marked_extents(struct btrfs_root *root, @@ -3740,13 +3888,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, int mark) { int ret; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; struct extent_buffer *eb; u64 start = 0; u64 end; - u64 offset; - unsigned long index; while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, @@ -3756,36 +3900,17 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) + eb = btrfs_find_tree_block(root, start, + root->leafsize); + start += root->leafsize; + if (!eb) continue; - offset = page_offset(page); - - spin_lock(&dirty_pages->buffer_lock); - eb = radix_tree_lookup( - &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, - offset >> PAGE_CACHE_SHIFT); - spin_unlock(&dirty_pages->buffer_lock); - if (eb) - ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags); - if (PageWriteback(page)) - end_page_writeback(page); - - lock_page(page); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&page->mapping->tree_lock); - } + wait_on_extent_buffer_writeback(eb); - unlock_page(page); - page_cache_release(page); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags)) + clear_extent_buffer_dirty(eb); + free_extent_buffer_stale(eb); } } @@ -3839,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, cur_trans->dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ - cur_trans->in_commit = 1; - cur_trans->blocked = 1; + cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); btrfs_evict_pending_snapshots(cur_trans); - cur_trans->blocked = 0; + cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - cur_trans->commit_done = 1; - wake_up(&cur_trans->commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); @@ -3860,13 +3980,16 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + cur_trans->state =TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + /* memset(cur_trans, 0, sizeof(*cur_trans)); kmem_cache_free(btrfs_transaction_cachep, cur_trans); */ } -int btrfs_cleanup_transaction(struct btrfs_root *root) +static int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; LIST_HEAD(list); @@ -3875,7 +3998,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->trans_no_join = 1; + root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&list)) { @@ -3883,41 +4006,31 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_ordered_operations(t, root); - btrfs_destroy_ordered_extents(root); + btrfs_destroy_all_ordered_extents(root->fs_info); btrfs_destroy_delayed_refs(t, root); - btrfs_block_rsv_release(root, - &root->fs_info->trans_block_rsv, - t->dirty_pages.dirty_bytes); - - /* FIXME: cleanup wait for commit */ - t->in_commit = 1; - t->blocked = 1; + /* + * FIXME: cleanup wait for commit + * We needn't acquire the lock here, because we are during + * the umount, there is no other task which will change it. + */ + t->state = TRANS_STATE_COMMIT_START; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); btrfs_evict_pending_snapshots(t); - t->blocked = 0; + t->state = TRANS_STATE_UNBLOCKED; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); - t->commit_done = 1; - smp_mb(); - if (waitqueue_active(&t->commit_wait)) - wake_up(&t->commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_delalloc_inodes(root); - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_delalloc_inodes(root->fs_info); btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -3925,15 +4038,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + t->state = TRANS_STATE_COMPLETED; + smp_mb(); + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + atomic_set(&t->use_count, 0); list_del_init(&t->list); memset(t, 0, sizeof(*t)); kmem_cache_free(btrfs_transaction_cachep, t); } - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 034d7dc552b2..b71acd6e1e5b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -61,23 +61,48 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -void btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location); +int btrfs_init_fs_root(struct btrfs_root *root); +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root); void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info->subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(&root->refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); @@ -93,10 +118,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 objectid); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 81ee29eeb7ca..4b8691607373 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, goto fail; } - if (btrfs_root_refs(&root->root_item) == 0) { - err = -ENOENT; - goto fail; - } - key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d551231caba..0236de711989 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -24,6 +24,7 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/percpu_counter.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -105,6 +106,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int reserve); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -270,9 +273,27 @@ static int exclude_super_stripes(struct btrfs_root *root, return ret; while (nr--) { - cache->bytes_super += stripe_len; - ret = add_excluded_extent(root, logical[nr], - stripe_len); + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = add_excluded_extent(root, start, len); if (ret) { kfree(logical); return ret; @@ -419,8 +440,7 @@ again: if (ret) break; - if (need_resched() || - btrfs_next_leaf(extent_root, path)) { + if (need_resched()) { caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); @@ -428,6 +448,12 @@ again: cond_resched(); goto again; } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto err; + if (ret) + break; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); continue; @@ -442,11 +468,16 @@ again: block_group->key.offset) break; - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); - last = key.objectid + key.offset; + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->tree_root->leafsize; + else + last = key.objectid + key.offset; if (total_found > (1024 * 1024 * 2)) { total_found = 0; @@ -656,55 +687,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner) -{ - struct btrfs_block_group_cache *cache; - u64 used; - u64 last = max(search_hint, search_start); - u64 group_start = 0; - int full_search = 0; - int factor = 9; - int wrapped = 0; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - if (!cache) - break; - - spin_lock(&cache->lock); - last = cache->key.objectid + cache->key.offset; - used = btrfs_block_group_used(&cache->item); - - if ((full_search || !cache->ro) && - block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { - if (used + cache->pinned + cache->reserved < - div_factor(cache->key.offset, factor)) { - group_start = cache->key.objectid; - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - goto found; - } - } - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - cond_resched(); - } - if (!wrapped) { - last = search_start; - wrapped = 1; - goto again; - } - if (!full_search && factor < 10) { - last = search_start; - full_search = 1; - factor = 10; - goto again; - } -found: - return group_start; -} - /* simple helper to search for an existing extent at a given offset */ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) { @@ -718,15 +700,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.objectid = start; key.offset = len; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); + if (ret > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == start && + key.type == BTRFS_METADATA_ITEM_KEY) + ret = 0; + } btrfs_free_path(path); return ret; } /* - * helper function to lookup reference count and flags of extent. + * helper function to lookup reference count and flags of a tree block. * * the head node for delayed ref is used to store the sum of all the * reference count modifications queued up in the rbtree. the head @@ -736,7 +724,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags) { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; @@ -749,13 +737,29 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 extent_flags; int ret; + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { + offset = root->leafsize; + metadata = 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; + if (metadata) { + key.objectid = bytenr; + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = offset; + } else { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; + } + if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; @@ -766,6 +770,13 @@ again: if (ret < 0) goto out_free; + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = root->leafsize; + btrfs_release_path(path); + goto again; + } + if (ret == 0) { leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1001,7 +1012,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, return ret; BUG_ON(ret); /* Corruption */ - btrfs_extend_item(trans, root, path, new_size); + btrfs_extend_item(root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1453,6 +1464,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int want; int ret; int err = 0; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1464,11 +1477,46 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, path->keep_locks = 1; } else extra_size = -1; + + /* + * Owner is our parent level, so we can just add one to get the level + * for the block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner; + } + +again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) { err = ret; goto out; } + + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. + */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + if (ret) { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; + } + } + if (ret && !insert) { err = -ENOENT; goto out; @@ -1504,11 +1552,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } err = -ENOENT; @@ -1590,8 +1636,7 @@ out: * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +void setup_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -1614,7 +1659,7 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(trans, root, path, size); + btrfs_extend_item(root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1683,8 +1728,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -void update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +void update_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, @@ -1740,7 +1784,7 @@ void update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); } @@ -1762,10 +1806,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(trans, root, path, iref, + update_inline_extent_backref(root, path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans, root, path, iref, parent, + setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1802,7 +1846,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - update_inline_extent_backref(trans, root, path, iref, + update_inline_extent_backref(root, path, iref, -refs_to_drop, NULL); } else if (is_data) { ret = remove_extent_data_ref(trans, root, path, refs_to_drop); @@ -1973,10 +2017,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); + if (extent_op) flags |= extent_op->flags_to_set; - } ret = alloc_reserved_file_extent(trans, root, parent, ref_root, flags, ref->objectid, ref->offset, @@ -2029,18 +2071,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; + int metadata = !extent_op->is_data; if (trans->aborted) return 0; + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + metadata = 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + if (metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = extent_op->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + } + +again: path->reada = 1; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, @@ -2050,6 +2103,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, goto out; } if (ret > 0) { + if (metadata) { + btrfs_release_path(path); + metadata = 0; + + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } err = -EIO; goto out; } @@ -2089,10 +2150,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 parent = 0; u64 ref_root = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) @@ -2100,10 +2159,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, else ref_root = ref->root; + ins.objectid = node->bytenr; + if (skinny_metadata) { + ins.offset = ref->level; + ins.type = BTRFS_METADATA_ITEM_KEY; + } else { + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + } + BUG_ON(node->ref_mod != 1); if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, root, parent, ref_root, extent_op->flags_to_set, @@ -2307,9 +2374,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - printk(KERN_DEBUG - "btrfs: run_delayed_extent_op " - "returned %d\n", ret); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; @@ -2348,8 +2413,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ret) { btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); - printk(KERN_DEBUG - "btrfs: run_one_delayed_ref returned %d\n", ret); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); spin_lock(&delayed_refs->lock); return ret; } @@ -2426,9 +2490,11 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, if (list_empty(&trans->qgroup_ref_list) != !trans->delayed_ref_elem.seq) { /* list without seq or seq without list */ - printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", + btrfs_err(fs_info, + "qgroup accounting update error, list is%s empty, seq is %#x.%x", list_empty(&trans->qgroup_ref_list) ? "" : " not", - trans->delayed_ref_elem.seq); + (u32)(trans->delayed_ref_elem.seq >> 32), + (u32)trans->delayed_ref_elem.seq); BUG(); } @@ -2461,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, return 0; } +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 num_bytes; + int ret = 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->leafsize; + num_bytes <<= 1; + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) + num_bytes <<= 1; + + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2508,7 +2619,8 @@ progress: old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); if (old) { DEFINE_WAIT(__wait); - if (delayed_refs->num_entries < 16348) + if (delayed_refs->flushing || + !btrfs_should_throttle_delayed_refs(trans, root)) return 0; prepare_to_wait(&delayed_refs->wait, &__wait, @@ -2543,7 +2655,7 @@ again: while (1) { if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) + !btrfs_should_throttle_delayed_refs(trans, root)) break; /* @@ -2564,6 +2676,7 @@ again: spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); atomic_dec(&delayed_refs->procs_running_refs); + wake_up(&delayed_refs->wait); return ret; } @@ -2650,7 +2763,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data) + int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2663,6 +2776,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); @@ -3040,6 +3154,11 @@ again: WARN_ON(ret); if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) @@ -3239,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info *found; int i; int factor; + int ret; if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -3262,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; + ret = percpu_counter_init(&found->total_bytes_pinned, 0); + if (ret) { + kfree(found); + return ret; + } + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); @@ -3337,7 +3463,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) * progress (either running or paused) picks the target profile (if it's * already available), otherwise falls back to plain reducing. */ -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* * we add in the count of missing devices because we want @@ -3494,10 +3620,11 @@ alloc: } /* - * If we have less pinned bytes than we want to allocate then - * don't bother committing the transaction, it won't help us. + * If we don't have enough pinned space to deal with this + * allocation don't bother committing the transaction. */ - if (data_sinfo->bytes_pinned < bytes) + if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, + bytes) < 0) committed = 1; spin_unlock(&data_sinfo->lock); @@ -3506,6 +3633,7 @@ commit_trans: if (!committed && !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3538,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); + WARN_ON(data_sinfo->bytes_may_use < bytes); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 0); @@ -3557,6 +3686,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, int force) { @@ -3574,7 +3708,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * global_rsv, it doesn't change except when the transaction commits. */ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) - num_allocated += global_rsv->size; + num_allocated += calc_global_rsv_need_space(global_rsv); /* * in limited mode, we want to have some free space up to @@ -3627,8 +3761,8 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, thresh = get_system_chunk_thresh(root, type); if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { - printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", - left, thresh, type); + btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); dump_space_info(info, 0, 0); } @@ -3746,7 +3880,7 @@ static int can_overcommit(struct btrfs_root *root, { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); - u64 rsv_size = 0; + u64 space_size; u64 avail; u64 used; u64 to_add; @@ -3754,18 +3888,16 @@ static int can_overcommit(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; - spin_lock(&global_rsv->lock); - rsv_size = global_rsv->size; - spin_unlock(&global_rsv->lock); - /* * We only want to allow over committing if we have lots of actual space * free, but if we don't have enough space to handle the global reserve * space then we could end up having a real enospc problem when trying * to allocate a chunk or some other such important allocation. */ - rsv_size <<= 1; - if (used + rsv_size >= space_info->total_bytes) + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) return 0; used += space_info->bytes_may_use; @@ -3808,16 +3940,15 @@ static int can_overcommit(struct btrfs_root *root, return 0; } -void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) +static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages) { struct super_block *sb = root->fs_info->sb; - int started; - /* If we can not start writeback, just sync all the delalloc file. */ - started = try_to_writeback_inodes_sb_nr(sb, nr_pages, - WB_REASON_FS_FREE_SPACE); - if (!started) { + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { /* * We needn't worry the filesystem going from r/w to r/o though * we don't acquire ->s_umount mutex, because the filesystem @@ -3825,8 +3956,9 @@ void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_all_delalloc_inodes(root->fs_info, 0); + if (!current->journal_info) + btrfs_wait_all_ordered_extents(root->fs_info, 0); } } @@ -3856,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); return; } @@ -3884,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -3922,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root, /* See if there is enough pinned space to make this reservation */ spin_lock(&space_info->lock); - if (space_info->bytes_pinned >= bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) { spin_unlock(&space_info->lock); goto commit; } @@ -3937,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root, spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (space_info->bytes_pinned + delayed_rsv->size < bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); spin_unlock(&space_info->lock); return -ENOSPC; @@ -4222,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -4489,6 +4648,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; update_global_block_rsv(fs_info); @@ -4953,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root, int factor; /* block accounting for super block */ - spin_lock(&info->delalloc_lock); + spin_lock(&info->delalloc_root_lock); old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); + spin_unlock(&info->delalloc_root_lock); while (total) { cache = btrfs_lookup_block_group(info, bytenr); @@ -5090,9 +5251,11 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; + int ret; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); /* Logic error */ + if (!cache) + return -EINVAL; /* * pull in the free space cache (if any) so that our pin @@ -5105,8 +5268,82 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, pin_down_extent(root, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ - btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); btrfs_put_block_group(cache); + return ret; +} + +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } + return 0; } @@ -5172,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; down_write(&fs_info->extent_commit_sem); @@ -5194,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->extent_commit_sem); + list_for_each_entry_rcu(space_info, &fs_info->space_info, list) + percpu_counter_set(&space_info->total_bytes_pinned, 0); + update_global_block_rsv(fs_info); } @@ -5291,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -5312,6 +5574,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); path = btrfs_alloc_path(); if (!path) @@ -5323,6 +5587,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; BUG_ON(!is_data && refs_to_drop != 1); + if (is_data) + skinny_metadata = 0; + ret = lookup_extent_backref(trans, extent_root, path, &iref, bytenr, num_bytes, parent, root_objectid, owner_objectid, @@ -5339,6 +5606,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -5364,12 +5636,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. + */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } + if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, (unsigned long long)bytenr); if (ret > 0) btrfs_print_leaf(extent_root, path->nodes[0]); @@ -5383,13 +5682,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else if (ret == -ENOENT) { btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); - printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + btrfs_err(info, + "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", + (unsigned long long)bytenr, + (unsigned long long)parent, + (unsigned long long)root_objectid, + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5417,9 +5716,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { @@ -5435,7 +5733,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -5443,7 +5742,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs < refs_to_drop); + if (refs < refs_to_drop) { + btrfs_err(info, "trying to drop %d refs but we only have %Lu " + "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + ret = -EINVAL; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } refs -= refs_to_drop; if (refs > 0) { @@ -5468,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -5591,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; + int pin = 1; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -5623,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + pin = 0; } out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + /* * Deleting the buffer, clear the corrupt flag since it doesn't matter * anymore. @@ -5641,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. @@ -5758,7 +6074,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, - u64 data) + u64 flags) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -5769,8 +6085,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = __get_raid_index(data); - int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + int index = __get_raid_index(flags); + int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; @@ -5783,11 +6099,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(orig_root, num_bytes, empty_size, data); + trace_find_free_extent(orig_root, num_bytes, empty_size, flags); - space_info = __find_space_info(root->fs_info, data); + space_info = __find_space_info(root->fs_info, flags); if (!space_info) { - printk(KERN_ERR "No space info for %llu\n", data); + btrfs_err(root->fs_info, "No space info for %llu", flags); return -ENOSPC; } @@ -5798,13 +6114,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (btrfs_mixed_space_info(space_info)) use_cluster = false; - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { + if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->data_alloc_cluster; } @@ -5833,7 +6149,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, data) && + if (block_group && block_group_bits(block_group, flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -5871,7 +6187,7 @@ search: * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, data)) { + if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | @@ -5883,7 +6199,7 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((data & extra) && !(block_group->flags & extra)) + if ((flags & extra) && !(block_group->flags & extra)) goto loop; } @@ -5914,7 +6230,7 @@ have_block_group: if (used_block_group != block_group && (!used_block_group || used_block_group->ro || - !block_group_bits(used_block_group, data))) { + !block_group_bits(used_block_group, flags))) { used_block_group = block_group; goto refill_cluster; } @@ -6110,7 +6426,7 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { - ret = do_chunk_alloc(trans, root, data, + ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); /* * Do not bail out on ENOSPC since we @@ -6188,16 +6504,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data) + struct btrfs_key *ins, int is_data) { bool final_tried = false; + u64 flags; int ret; - data = btrfs_get_alloc_profile(root, data); + flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, data); + hint_byte, ins, flags); if (ret == -ENOSPC) { if (!final_tried) { @@ -6210,10 +6527,10 @@ again: } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); + sinfo = __find_space_info(root->fs_info, flags); + btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", + (unsigned long long)flags, + (unsigned long long)num_bytes); if (sinfo) dump_space_info(sinfo, num_bytes, 1); } @@ -6232,8 +6549,8 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { - printk(KERN_ERR "Unable to find block group for %llu\n", - (unsigned long long)start); + btrfs_err(root->fs_info, "Unable to find block group for %llu", + (unsigned long long)start); return -ENOSPC; } @@ -6328,9 +6645,9 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + btrfs_err(fs_info, "update block group failed for %llu %llu", + (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); BUG(); } return ret; @@ -6349,7 +6666,12 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + u32 size = sizeof(*extent_item) + sizeof(*iref); + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!skinny_metadata) + size += sizeof(*block_info); path = btrfs_alloc_path(); if (!path) @@ -6370,12 +6692,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); + if (skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); if (parent > 0) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, @@ -6390,11 +6716,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + btrfs_err(fs_info, "update block group failed for %llu %llu", + (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); BUG(); } return ret; @@ -6428,58 +6754,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, 0); - caching_ctl = get_caching_control(block_group); - - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); + if (ret) + return ret; } + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); /* logic error */ - btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); + btrfs_put_block_group(block_group); return ret; } -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level) +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u32 blocksize, int level) { struct extent_buffer *buf; @@ -6522,51 +6823,51 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); - if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve. - */ - if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; } - ret = block_rsv_use_bytes(block_rsv, blocksize); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; - if (ret && !block_rsv->failfast) { - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - return block_rsv; - } else if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; } - - return ERR_PTR(-ENOSPC); + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, @@ -6594,7 +6895,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf; u64 flags = 0; int ret; - + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -6627,9 +6929,13 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - extent_op->update_key = 1; + if (skinny_metadata) + extent_op->update_key = 0; + else + extent_op->update_key = 1; extent_op->update_flags = 1; extent_op->is_data = 0; + extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, @@ -6704,8 +7010,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); + ret = btrfs_lookup_extent_info(trans, root, bytenr, + wc->level - 1, 1, &refs, + &flags); /* We don't care about errors in readahead. */ if (ret < 0) continue; @@ -6772,7 +7079,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); BUG_ON(ret == -ENOMEM); @@ -6800,7 +7107,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, - eb->len, flag, 0); + eb->len, flag, + btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -6870,7 +7178,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); if (ret < 0) { @@ -6878,7 +7186,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return ret; } - BUG_ON(wc->refs[level - 1] == 0); + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(root->fs_info, "Missing references."); + BUG(); + } *lookup_info = 0; if (wc->stage == DROP_REFERENCE) { @@ -6917,8 +7228,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, blocksize, generation); - if (!next) + if (!next || !extent_buffer_uptodate(next)) { + free_extent_buffer(next); return -EIO; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -7001,7 +7314,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); if (ret < 0) { @@ -7137,6 +7450,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN */ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, @@ -7211,8 +7526,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_lookup_extent_info(trans, root, path->nodes[level]->start, - path->nodes[level]->len, - &wc->refs[level], + level, 1, &wc->refs[level], &wc->flags[level]); if (ret < 0) { err = ret; @@ -7238,6 +7552,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("btrfs: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_end_trans; + } + ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { err = ret; @@ -7295,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, tree_root, ret); err = ret; @@ -7313,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); + btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - kfree(root); + btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); @@ -7630,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; @@ -7716,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) do_div(min_free, dev_min); } + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -7726,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !device->is_tgtdev_for_dev_replace) { - ret = find_free_dev_extent(device, min_free, + ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7738,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) } } mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); out: btrfs_put_block_group(block_group); return ret; @@ -7880,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } + percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); kfree(space_info); } @@ -8020,10 +8350,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) free_excluded_extents(root, cache); } + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + btrfs_put_block_group(cache); + goto error; + } + cache->space_info = space_info; spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -8031,9 +8377,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); /* Logic error */ - set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) set_block_group_ro(cache, 1); @@ -8089,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, sizeof(item)); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); } } @@ -8156,9 +8503,24 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + return ret; + } update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); @@ -8167,9 +8529,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); /* Logic error */ - list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -8414,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); - if (!ret) - wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } } ret = btrfs_trim_block_group(cache, &group_trimmed, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391fc7bf..583d98bd065e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -23,13 +23,83 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; +#ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); -#define LEAK_DEBUG 0 -#if LEAK_DEBUG static DEFINE_SPINLOCK(leak_lock); + +static inline +void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(new, head); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_del(struct list_head *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(entry); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_check(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} + +#define btrfs_debug_check_extent_io_range(inode, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct inode *inode, u64 start, u64 end) +{ + u64 isize = i_size_read(inode); + + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + printk_ratelimited(KERN_DEBUG + "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + caller, + (unsigned long long)btrfs_ino(inode), + (unsigned long long)isize, + (unsigned long long)start, + (unsigned long long)end); + } +} +#else +#define btrfs_leak_debug_add(new, head) do {} while (0) +#define btrfs_leak_debug_del(entry) do {} while (0) +#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif #define BUFFER_LRU_MAX 64 @@ -75,38 +145,26 @@ int __init extent_io_init(void) SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; return 0; +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + free_state_cache: kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; return -ENOMEM; } void extent_io_exit(void) { - struct extent_state *state; - struct extent_buffer *eb; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - - } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } + btrfs_leak_debug_check(); /* * Make sure all delayed rcu free are flushed before we @@ -117,6 +175,8 @@ void extent_io_exit(void) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -134,9 +194,6 @@ void extent_io_tree_init(struct extent_io_tree *tree, static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#if LEAK_DEBUG - unsigned long flags; -#endif state = kmem_cache_alloc(extent_state_cache, mask); if (!state) @@ -144,11 +201,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&state->leak_list, &states); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); @@ -160,15 +213,8 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#if LEAK_DEBUG - unsigned long flags; -#endif WARN_ON(state->tree); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_del(&state->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } @@ -308,21 +354,21 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->set_bit_hook) tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, int *bits); + struct extent_state *state, unsigned long *bits); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -336,7 +382,7 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int *bits) + unsigned long *bits) { struct rb_node *node; @@ -424,10 +470,10 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - int *bits, int wake) + unsigned long *bits, int wake) { struct extent_state *next; - int bits_to_clear = *bits & ~EXTENT_CTLBITS; + unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -463,7 +509,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } -void extent_io_tree_panic(struct extent_io_tree *tree, int err) +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) { btrfs_panic(tree_fs_info(tree), err, "Locking error: " "Extent tree was modified by another " @@ -483,7 +529,7 @@ void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, + unsigned long bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { @@ -495,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + if (delete) bits |= ~EXTENT_CTLBITS; bits |= EXTENT_FIRST_DELALLOC; @@ -644,11 +695,14 @@ static void wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits) { struct extent_state *state; struct rb_node *node; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + spin_lock(&tree->lock); again: while (1) { @@ -685,9 +739,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int *bits) + unsigned long *bits) { - int bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -730,8 +784,9 @@ static void uncache_state(struct extent_state **cached_ptr) static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) + unsigned long bits, unsigned long exclusive_bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -740,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -923,9 +980,9 @@ search_again: goto again; } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, - u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, u64 * failed_start, + struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, cached_state, mask); @@ -950,7 +1007,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, + unsigned long bits, unsigned long clear_bits, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -960,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -1143,14 +1202,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1189,7 +1248,7 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, cached_state, mask); } @@ -1205,7 +1264,7 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state) + unsigned long bits, struct extent_state **cached_state) { int err; u64 failed_start; @@ -1313,8 +1372,9 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) * return it. tree->lock must be held. NULL will returned if * nothing was found after 'start' */ -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits) +static struct extent_state * +find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, unsigned long bits) { struct rb_node *node; struct extent_state *state; @@ -1348,7 +1408,7 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits, + u64 *start_ret, u64 *end_ret, unsigned long bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1638,7 +1698,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - int clear_bits = 0; + unsigned long clear_bits = 0; if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; @@ -1777,6 +1837,64 @@ out: return ret; } +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count) +{ + struct rb_node *node; + struct extent_state *state; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state->start != start); + + while (count) { + state->private = *csums++; + count--; + state = next_state(state); + } + spin_unlock(&tree->lock); +} + +static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) +{ + struct bio_vec *bvec = bio->bi_io_vec + bio_index; + + return page_offset(bvec->bv_page) + bvec->bv_offset; +} + +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, + u32 csums[], int count) +{ + struct rb_node *node; + struct extent_state *state = NULL; + u64 start; + + spin_lock(&tree->lock); + do { + start = __btrfs_get_bio_offset(bio, bio_index); + if (state == NULL || state->start != start) { + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state->start != start); + } + state->private = *csums++; + count--; + bio_index++; + + state = next_state(state); + } while (count); + spin_unlock(&tree->lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; @@ -1811,7 +1929,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached) + unsigned long bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -1873,28 +1991,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) } /* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static void check_page_locked(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static void check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); -} - -/* * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the @@ -1971,7 +2067,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) return 0; - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_private = &compl; @@ -2261,7 +2357,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { free_io_failure(inode, failrec, 0); return -EIO; @@ -2323,19 +2419,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; do { struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page write in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); @@ -2343,10 +2444,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (end_extent_writepage(page, err, start, end)) continue; - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); + end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); @@ -2371,7 +2469,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; int mirror; int ret; @@ -2382,19 +2479,27 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct page *page = bvec->bv_page; struct extent_state *cached = NULL; struct extent_state *state; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%ld\n", (u64)bio->bi_sector, err, - (long int)bio->bi_bdev); - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + "mirror=%lu\n", (u64)bio->bi_sector, err, + io_bio->mirror_num); + tree = &BTRFS_I(inode)->io_tree; + + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page read in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); + + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); @@ -2410,7 +2515,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); - mirror = (int)(unsigned long)bio->bi_bdev; + mirror = io_bio->mirror_num; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); @@ -2453,39 +2558,43 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); + if (uptodate) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Zero out the end if this page straddles i_size */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + SetPageUptodate(page); } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); + ClearPageUptodate(page); + SetPageError(page); } + unlock_page(page); } while (bvec <= bvec_end); bio_put(bio); } +/* + * this allocates from the btrfs_bioset. We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic + */ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { struct bio *bio; - bio = bio_alloc(gfp_flags, nr_vecs); + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); + } } if (bio) { @@ -2496,6 +2605,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); +} + + +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); +} + + static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { @@ -2560,8 +2682,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (old_compressed) contig = bio->bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || @@ -2596,7 +2717,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +static void attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); @@ -2626,7 +2748,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags) + unsigned long *bio_flags, int rw) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -2772,7 +2894,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(READ, tree, page, + ret = submit_extent_page(rw, tree, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, @@ -2805,7 +2927,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags); + &bio_flags, READ); if (bio) ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; @@ -2874,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; } @@ -3104,7 +3226,7 @@ static int eb_wait(void *word) return 0; } -static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, TASK_UNINTERRUPTIBLE); @@ -3229,7 +3351,7 @@ static int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); @@ -3666,14 +3788,14 @@ int extent_readpages(struct extent_io_tree *tree, continue; for (i = 0; i < nr; i++) { __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags); + &bio, 0, &bio_flags, READ); page_cache_release(pagepool[i]); } nr = 0; } for (i = 0; i < nr; i++) { __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags); + &bio, 0, &bio_flags, READ); page_cache_release(pagepool[i]); } @@ -3714,9 +3836,9 @@ int extent_invalidatepage(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) +static int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, + struct page *page, gfp_t mask) { u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; @@ -3913,7 +4035,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4000,19 +4122,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; } static void __free_extent_buffer(struct extent_buffer *eb) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -4022,9 +4139,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, gfp_t mask) { struct extent_buffer *eb = NULL; -#if LEAK_DEBUG - unsigned long flags; -#endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); if (eb == NULL) @@ -4044,11 +4158,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&eb->leak_list, &buffers); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&eb->leak_list, &buffers); + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); @@ -4386,7 +4497,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) } /* Expects to have eb->eb_lock already held */ -static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +static int release_extent_buffer(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { @@ -4444,7 +4555,7 @@ void free_extent_buffer(struct extent_buffer *eb) * I know this is terrible, but it's temporary until we stop tracking * the uptodate bits and such for the extent buffers. */ - release_extent_buffer(eb, GFP_ATOMIC); + release_extent_buffer(eb); } void free_extent_buffer_stale(struct extent_buffer *eb) @@ -4458,7 +4569,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb) if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_dec(&eb->refs); - release_extent_buffer(eb, GFP_NOFS); + release_extent_buffer(eb); } void clear_extent_buffer_dirty(struct extent_buffer *eb) @@ -4510,17 +4621,6 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) return was_dirty; } -static int range_straddles_pages(u64 start, u64 len) -{ - if (len < PAGE_CACHE_SIZE) - return 1; - if (start & (PAGE_CACHE_SIZE - 1)) - return 1; - if ((start + len) & (PAGE_CACHE_SIZE - 1)) - return 1; - return 0; -} - int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; @@ -4552,37 +4652,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) return 0; } -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end) -{ - struct page *page; - int ret; - int pg_uptodate = 1; - int uptodate; - unsigned long index; - - if (range_straddles_pages(start, end - start + 1)) { - ret = test_range_bit(tree, start, end, - EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; - } - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - page = find_get_page(tree->mapping, index); - if (!page) - return 1; - uptodate = PageUptodate(page); - page_cache_release(page); - if (!uptodate) { - pg_uptodate = 0; - break; - } - start += PAGE_CACHE_SIZE; - } - return pg_uptodate; -} - int extent_buffer_uptodate(struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -4645,7 +4714,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num, &bio_flags); + mirror_num, &bio_flags, + READ | REQ_META); if (err) ret = err; } else { @@ -4654,7 +4724,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) { - err = submit_one_bio(READ, bio, mirror_num, bio_flags); + err = submit_one_bio(READ | REQ_META, bio, mirror_num, + bio_flags); if (err) return err; } @@ -5018,7 +5089,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -int try_release_extent_buffer(struct page *page, gfp_t mask) +int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; @@ -5048,9 +5119,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask) } spin_unlock(&page->mapping->private_lock); - if ((mask & GFP_NOFS) == GFP_NOFS) - mask = GFP_NOFS; - /* * If tree ref isn't set then we know the ref on this eb is a real ref, * so just return, this page will likely be freed soon anyway. @@ -5060,5 +5128,5 @@ int try_release_extent_buffer(struct page *page, gfp_t mask) return 0; } - return release_extent_buffer(eb, mask); + return release_extent_buffer(eb); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 258c92156857..3b8c4e26e1da 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -19,6 +19,7 @@ #define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_NEED_WAIT (1 << 13) #define EXTENT_DAMAGED (1 << 14) +#define EXTENT_NORESERVE (1 << 15) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -81,9 +82,9 @@ struct extent_io_ops { int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); + unsigned long *bits); void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); + unsigned long *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -116,7 +117,9 @@ struct extent_state { /* for use by the FS */ u64 private; +#ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; +#endif }; #define INLINE_EXTENT_BUFFER_PAGES 16 @@ -132,7 +135,6 @@ struct extent_buffer { atomic_t refs; atomic_t io_pages; int read_mirror; - struct list_head leak_list; struct rcu_head rcu_head; pid_t lock_owner; @@ -159,6 +161,9 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; wait_queue_head_t lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -185,13 +190,10 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page, gfp_t mask); -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); +int try_release_extent_buffer(struct page *page); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached); + unsigned long bits, struct extent_state **cached); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); @@ -207,16 +209,17 @@ u64 count_range_bits(struct extent_io_tree *tree, void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached_state); + unsigned long bits, int filled, + struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + unsigned long bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, struct extent_state **cached, - gfp_t mask); + unsigned long bits, int wake, int delete, + struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + unsigned long bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, u64 *failed_start, + unsigned long bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -229,17 +232,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, + unsigned long bits, unsigned long clear_bits, struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits, + u64 *start_ret, u64 *end_ret, unsigned long bits, struct extent_state **cached_state); -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); int extent_write_full_page(struct extent_io_tree *tree, struct page *page, @@ -261,6 +262,10 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count); +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, + int bvec_index, u32 csums[], int count); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); @@ -278,6 +283,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) { @@ -313,7 +319,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_buffer *eb); @@ -323,8 +328,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len); -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end); int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, @@ -334,6 +337,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2834ca5768ea..a4a7a1a8da95 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -174,6 +174,14 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) test_bit(EXTENT_FLAG_LOGGING, &next->flags)) return 0; + /* + * We don't want to merge stuff that hasn't been written to the log yet + * since it may not reflect exactly what is on disk, and that would be + * bad. + */ + if (!list_empty(&prev->list) || !list_empty(&next->list)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -209,9 +217,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - list_move(&em->list, &tree->modified_extents); - list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); free_extent_map(merge); } @@ -227,7 +233,6 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge->in_tree = 0; em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); - list_del_init(&merge->list); free_extent_map(merge); } } @@ -302,7 +307,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) * reference dropped if the merge attempt was successful. */ int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em) + struct extent_map *em, int modified) { int ret = 0; struct rb_node *rb; @@ -324,7 +329,10 @@ int add_extent_mapping(struct extent_map_tree *tree, em->mod_start = em->start; em->mod_len = em->len; - try_merge_map(tree, em); + if (modified) + list_move(&em->list, &tree->modified_extents); + else + try_merge_map(tree, em); out: return ret; } @@ -337,8 +345,9 @@ static u64 range_end(u64 start, u64 len) return start + len; } -struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) +static struct extent_map * +__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index c6598c89cff8..61adc44b7805 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,6 +26,7 @@ struct extent_map { u64 mod_len; u64 orig_start; u64 orig_block_len; + u64 ram_bytes; u64 block_start; u64 block_len; u64 generation; @@ -61,7 +62,7 @@ void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em); + struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *alloc_extent_map(void); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c4628a201cb3..a7bfc9541803 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)->sectorsize - (r)->sectorsize) + sizeof(u32) * (r)->sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -83,10 +82,11 @@ out: return ret; } -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow) +static struct btrfs_csum_item * +btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) { int ret; struct btrfs_key file_key; @@ -152,32 +152,12 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -u64 btrfs_file_extent_length(struct btrfs_path *path) -{ - int extent_type; - struct btrfs_file_extent_item *fi; - u64 len; - - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(path->nodes[0], fi); - - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) - len = btrfs_file_extent_num_bytes(path->nodes[0], fi); - else if (extent_type == BTRFS_FILE_EXTENT_INLINE) - len = btrfs_file_extent_inline_len(path->nodes[0], fi); - else - BUG(); - - return len; -} - static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -186,7 +166,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int ret; + int count; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -214,10 +194,12 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); - if (ret == 0) + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, + len); + if (count) goto found; if (!item || disk_bytenr < item_start_offset || @@ -230,10 +212,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + sum[0] = 0; if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, @@ -269,19 +249,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, diff = disk_bytenr - item_start_offset; diff = diff / root->sectorsize; diff = diff * csum_size; - - read_extent_buffer(path->nodes[0], &sum, + count = min_t(int, len, (item_last_offset - disk_bytenr) >> + inode->i_sb->s_blocksize_bits); + read_extent_buffer(path->nodes[0], sum, ((unsigned long)item) + diff, - csum_size); + csum_size * count); found: - if (dst) - *dst++ = sum; - else - set_state_private(io_tree, offset, sum); - disk_bytenr += bvec->bv_len; - offset += bvec->bv_len; - bio_index++; - bvec++; + if (dst) { + memcpy(dst, sum, count * csum_size); + dst += count; + } else { + if (dio) + extent_cache_csums_dio(io_tree, offset, sum, + count); + else + extent_cache_csums(io_tree, bio, bio_index, sum, + count); + } + while (count--) { + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bio_index++; + bvec++; + } } btrfs_free_path(path); return 0; @@ -306,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -358,11 +347,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - key.type != BTRFS_EXTENT_CSUM_KEY) - break; - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.offset > end) + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) break; if (key.offset > start) @@ -380,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); + MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums->sums; sums->bytenr = start; - sums->len = size; + sums->len = (int)size; offset = (start - key.offset) >> root->fs_info->sb->s_blocksize_bits; offset *= csum_size; + size >>= root->fs_info->sb->s_blocksize_bits; - while (size > 0) { - read_extent_buffer(path->nodes[0], - §or_sum->sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum->bytenr = start; - - size -= root->sectorsize; - start += root->sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path->nodes[0], + sums->sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root->sectorsize * size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; @@ -429,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums->sums; - disk_bytenr = (u64)bio->bi_sector << 9; sums->len = bio->bi_size; INIT_LIST_HEAD(&sums->list); @@ -456,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; + sums->bytenr = (u64)bio->bi_sector << 9; + index = 0; while (bio_index < bio->bi_vcnt) { if (!contig) @@ -475,29 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); BUG_ON(!sums); /* -ENOMEM */ - sector_sum = sums->sums; sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; + sums->bytenr = ((u64)bio->bi_sector << 9) + + total_bytes; + index = 0; } data = kmap_atomic(bvec->bv_page); - sector_sum->sum = ~(u32)0; - sector_sum->sum = btrfs_csum_data(root, - data + bvec->bv_offset, - sector_sum->sum, - bvec->bv_len); + sums->sums[index] = ~(u32)0; + sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, + sums->sums[index], + bvec->bv_len); kunmap_atomic(data); - btrfs_csum_final(sector_sum->sum, - (char *)§or_sum->sum); - sector_sum->bytenr = disk_bytenr; + btrfs_csum_final(sums->sums[index], + (char *)(sums->sums + index)); - sector_sum++; bio_index++; + index++; total_bytes += bvec->bv_len; this_sum_bytes += bvec->bv_len; - disk_bytenr += bvec->bv_len; offset += bvec->bv_len; bvec++; } @@ -518,8 +494,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. */ -static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline void truncate_one_csum(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) @@ -544,7 +519,7 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(trans, root, path, new_size, 1); + btrfs_truncate_item(root, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -556,10 +531,10 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(trans, root, path, new_size, 0); + btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(trans, root, path, key); + btrfs_set_item_key_safe(root, path, key); } else { BUG(); } @@ -674,7 +649,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - truncate_one_csum(trans, root, path, &key, bytenr, len); + truncate_one_csum(root, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -686,62 +661,46 @@ out: return ret; } -static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, - struct btrfs_sector_sum *sector_sum, - u64 total_bytes, u64 sectorsize) -{ - u64 tmp = sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; - - while ((tmp + total_bytes) < sums->len) { - if (next_sector + sectorsize != next->bytenr) - break; - tmp += sectorsize; - next_sector = next->bytenr; - next++; - } - return tmp; -} - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) { - u64 bytenr; - int ret; struct btrfs_key file_key; struct btrfs_key found_key; - u64 next_offset; - u64 total_bytes = 0; - int found_next; struct btrfs_path *path; struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; + u64 next_offset; + u64 total_bytes = 0; u64 csum_offset; - struct btrfs_sector_sum *sector_sum; + u64 bytenr; u32 nritems; u32 ins_size; + int index = 0; + int found_next; + int ret; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - - sector_sum = sums->sums; again: next_offset = (u64)-1; found_next = 0; + bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum->bytenr; - bytenr = sector_sum->bytenr; + file_key.offset = bytenr; btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path->nodes[0]; ret = 0; + leaf = path->nodes[0]; + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); @@ -821,8 +780,7 @@ again: free_space = btrfs_leaf_free_space(root, leaf) - sizeof(struct btrfs_item) - csum_size; - tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, - root->sectorsize); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; WARN_ON(tmp < 1); @@ -835,7 +793,8 @@ again: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(trans, root, path, diff); + btrfs_extend_item(root, path, diff); + ret = 0; goto csum; } @@ -845,8 +804,7 @@ insert: if (found_next) { u64 tmp; - tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, - root->sectorsize); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> root->fs_info->sb->s_blocksize_bits); @@ -867,31 +825,25 @@ insert: WARN_ON(1); goto fail_unlock; } -csum: leaf = path->nodes[0]; +csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - ret = 0; + item_end = (struct btrfs_csum_item *)((unsigned char *)item + + btrfs_item_size_nr(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: - item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); -next_sector: - - write_extent_buffer(leaf, §or_sum->sum, (unsigned long)item, csum_size); - - total_bytes += root->sectorsize; - sector_sum++; - if (total_bytes < sums->len) { - item = (struct btrfs_csum_item *)((char *)item + - csum_size); - if (item < item_end && bytenr + PAGE_CACHE_SIZE == - sector_sum->bytenr) { - bytenr = sector_sum->bytenr; - goto next_sector; - } - } + ins_size = (u32)(sums->len - total_bytes) >> + root->fs_info->sb->s_blocksize_bits; + ins_size *= csum_size; + ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, + ins_size); + write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, + ins_size); + + ins_size /= csum_size; + total_bytes += ins_size * root->sectorsize; + index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ade03e6f7bd2..a005fe2c072a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -192,8 +193,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * the same inode in the tree, we will merge them together (by * __btrfs_add_inode_defrag()) and free the one that we want to requeue. */ -void btrfs_requeue_inode_defrag(struct inode *inode, - struct inode_defrag *defrag) +static void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -308,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode_root); goto cleanup; } - if (btrfs_root_refs(&inode_root->root_item) == 0) { - ret = -ENOENT; - goto cleanup; - } key.objectid = defrag->ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); @@ -473,7 +470,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* * unlocks pages after btrfs_file_write is done with them */ -void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { @@ -497,9 +494,9 @@ void btrfs_drop_pages(struct page **pages, size_t num_pages) * doing real data extents, marking pages dirty and delalloc as required. */ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, - struct page **pages, size_t num_pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached) + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) { int err = 0; int i; @@ -552,6 +549,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int testend = 1; unsigned long flags; int compressed = 0; + bool modified; WARN_ON(end < start); if (end == (u64)-1) { @@ -561,6 +559,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, while (1) { int no_splits = 0; + modified = false; if (!split) split = alloc_extent_map(); if (!split2) @@ -592,6 +591,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -607,15 +607,15 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; + split->ram_bytes = em->ram_bytes; split->orig_block_len = max(split->block_len, em->orig_block_len); split->generation = gen; split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split); + ret = add_extent_mapping(em_tree, split, modified); BUG_ON(ret); /* Logic error */ - list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = split2; split2 = NULL; @@ -632,6 +632,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->generation = gen; split->orig_block_len = max(em->block_len, em->orig_block_len); + split->ram_bytes = em->ram_bytes; if (compressed) { split->block_len = em->block_len; @@ -643,9 +644,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_start = em->orig_start; } - ret = add_extent_mapping(em_tree, split); + ret = add_extent_mapping(em_tree, split, modified); BUG_ON(ret); /* Logic error */ - list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = NULL; } @@ -821,7 +821,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -1037,7 +1037,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1071,7 +1071,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1313,6 +1313,56 @@ fail: } +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + return PTR_ERR(trans); + } + + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL, + NULL); + btrfs_end_transaction(trans, root); + if (ret <= 0) { + ret = 0; + } else { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + NULL, GFP_NOFS); + *write_bytes = min_t(size_t, *write_bytes, num_bytes); + } + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + + return ret; +} + static noinline ssize_t __btrfs_buffered_write(struct file *file, struct iov_iter *i, loff_t pos) @@ -1320,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + u64 release_bytes = 0; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; + bool only_release_metadata = false; bool force_page_uptodate = false; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / @@ -1344,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, offset); size_t num_pages = (write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1358,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } - ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = btrfs_check_data_free_space(inode, reserve_bytes); + if (ret == -ENOSPC && + (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC))) { + ret = check_can_nocow(inode, pos, &write_bytes); + if (ret > 0) { + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = 0; + } else { + ret = -ENOSPC; + } + } + if (ret) break; + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, + reserve_bytes); + break; + } + + release_bytes = reserve_bytes; + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the @@ -1371,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ret = prepare_pages(root, file, pages, num_pages, pos, first_index, write_bytes, force_page_uptodate); - if (ret) { - btrfs_delalloc_release_space(inode, - num_pages << PAGE_CACHE_SHIFT); + if (ret) break; - } copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1405,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_delalloc_release_space(inode, - (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, + release_bytes); + else + btrfs_delalloc_release_space(inode, + release_bytes); } + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; if (copied > 0) { ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); if (ret) { - btrfs_delalloc_release_space(inode, - dirty_pages << PAGE_CACHE_SHIFT); btrfs_drop_pages(pages, num_pages); break; } } + release_bytes = 0; btrfs_drop_pages(pages, num_pages); + if (only_release_metadata && copied > 0) { + u64 lockstart = round_down(pos, root->sectorsize); + u64 lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; + } + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1441,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, kfree(pages); + if (release_bytes) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, release_bytes); + else + btrfs_delalloc_release_space(inode, release_bytes); + } + return num_written ? num_written : ret; } @@ -1514,8 +1617,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, size_t count, ocount; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); - sb_start_write(inode->i_sb); - mutex_lock(&inode->i_mutex); err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); @@ -1617,7 +1718,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); out: - sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } @@ -1885,7 +1985,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, path->slots[0]++; key.offset = offset; - btrfs_set_item_key_safe(trans, root, path, &key); + btrfs_set_item_key_safe(root, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -1915,6 +2015,7 @@ out: } else { hole_em->start = offset; hole_em->len = end - offset; + hole_em->ram_bytes = hole_em->len; hole_em->orig_start = offset; hole_em->block_start = EXTENT_MAP_HOLE; @@ -1927,10 +2028,7 @@ out: do { btrfs_drop_extent_cache(inode, offset, end - 1, 0); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, hole_em); - if (!ret) - list_move(&hole_em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, hole_em, 1); write_unlock(&em_tree->lock); } while (ret == -EEXIST); free_extent_map(hole_em); @@ -2176,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out_reserve_fail; } - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); if (ret) @@ -2192,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start); if (ret) goto out; + } else { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; } + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2426,20 +2533,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { - offset = -EINVAL; - goto out; - } - if (offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); return offset; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1f84fc09c1a8..b21a3cd667d8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -104,7 +104,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, spin_lock(&block_group->lock); if (!((BTRFS_I(inode)->flags & flags) == flags)) { - printk(KERN_INFO "Old style space inode found, converting.\n"); + btrfs_info(root->fs_info, + "Old style space inode found, converting."); BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; @@ -119,9 +120,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; } -int __create_free_space_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 ino, u64 offset) +static int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 ino, u64 offset) { struct btrfs_key key; struct btrfs_disk_key disk_key; @@ -195,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root, block_group->key.objectid); } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) { - struct btrfs_block_rsv *rsv; u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + int ret; /* 1 for slack space, 1 for updating the inode */ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + btrfs_calc_trans_metadata_size(root, 1); - spin_lock(&trans->block_rsv->lock); - if (trans->block_rsv->reserved < needed_bytes) { - spin_unlock(&trans->block_rsv->lock); - trans->block_rsv = rsv; - return -ENOSPC; - } - spin_unlock(&trans->block_rsv->lock); + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -230,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - trans->block_rsv = rsv; btrfs_abort_transaction(trans, root, ret); return ret; } @@ -240,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, inode); if (ret) btrfs_abort_transaction(trans, root, ret); - trans->block_rsv = rsv; return ret; } @@ -431,7 +432,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); @@ -461,7 +462,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) kunmap(io_ctl->pages[0]); io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { @@ -624,9 +625,9 @@ next: spin_unlock(&ctl->tree_lock); } -int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_path *path, u64 offset) +static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; @@ -669,10 +670,11 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, btrfs_release_path(path); if (BTRFS_I(inode)->generation != generation) { - printk(KERN_ERR "btrfs: free space inode generation (%llu) did" - " not match free space cache generation (%llu)\n", - (unsigned long long)BTRFS_I(inode)->generation, - (unsigned long long)generation); + btrfs_err(root->fs_info, + "free space inode generation (%llu) " + "did not match free space cache generation (%llu)", + (unsigned long long)BTRFS_I(inode)->generation, + (unsigned long long)generation); return 0; } @@ -721,8 +723,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = link_free_space(ctl, e); spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -741,8 +743,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -833,8 +835,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, if (!matched) { __btrfs_remove_free_space_cache(ctl); - printk(KERN_ERR "block group %llu has an wrong amount of free " - "space\n", block_group->key.objectid); + btrfs_err(fs_info, "block group %llu has wrong amount of free space", + block_group->key.objectid); ret = -1; } out: @@ -845,8 +847,8 @@ out: spin_unlock(&block_group->lock); ret = 0; - printk(KERN_ERR "btrfs: failed to load free space cache " - "for block group %llu\n", block_group->key.objectid); + btrfs_err(fs_info, "failed to load free space cache for block group %llu", + block_group->key.objectid); } iput(inode); @@ -866,11 +868,11 @@ out: * on mount. This will return 0 if it was successfull in writing the cache out, * and -1 if it was not. */ -int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; @@ -917,10 +919,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Make sure we can fit our crcs into the first page */ if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { - WARN_ON(1); + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) goto out_nospc; - } io_ctl_set_generation(&io_ctl, trans->transid); @@ -1104,8 +1104,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); ret = 0; #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free space cache " - "for block group %llu\n", block_group->key.objectid); + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); #endif } @@ -1564,7 +1565,8 @@ again: search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); - BUG_ON(ret < 0 || search_start != *offset); + if (ret < 0 || search_start != *offset) + return -EINVAL; /* We may have found more bits than what we need */ search_bytes = min(search_bytes, *bytes); @@ -1970,7 +1972,6 @@ again: re_search = true; goto again; } - BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); out: @@ -2064,7 +2065,8 @@ out: return 0; } -void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) +static void __btrfs_remove_free_space_cache_locked( + struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *node; @@ -2931,8 +2933,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) ret = __load_free_space_cache(root, inode, ctl, path, 0); if (ret < 0) - printk(KERN_ERR "btrfs: failed to load free ino cache for " - "root %llu\n", root->root_key.objectid); + btrfs_err(fs_info, + "failed to load free ino cache for root %llu", + root->root_key.objectid); out_put: iput(inode); out: @@ -2959,11 +2962,534 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (ret) { btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free ino cache " - "for root %llu\n", root->root_key.objectid); + btrfs_err(root->fs_info, + "failed to write free ino cache for root %llu", + root->root_key.objectid); #endif } iput(inode); return ret; } + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +static struct btrfs_block_group_cache *init_test_block_group(void) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = 1024 * 1024 * 1024; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + + btrfs_init_free_space_ctl(cache); + + return cache; +} + +/* + * Checks to see if the given range is in the free space cache. This is really + * just used to check the absence of space, so if there is free space in the + * range at all we will return 1. + */ +static int check_exists(struct btrfs_block_group_cache *cache, u64 offset, + u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info; + int ret = 0; + + spin_lock(&ctl->tree_lock); + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) + goto out; + } + +have_info: + if (info->bitmap) { + u64 bit_off, bit_bytes; + struct rb_node *n; + struct btrfs_free_space *tmp; + + bit_off = offset; + bit_bytes = ctl->unit; + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + if (!ret) { + if (bit_off == offset) { + ret = 1; + goto out; + } else if (bit_off > offset && + offset + bytes > bit_off) { + ret = 1; + goto out; + } + } + + n = rb_prev(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (tmp->offset + tmp->bytes < offset) + break; + if (offset + bytes < tmp->offset) { + n = rb_prev(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + n = rb_next(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (offset + bytes < tmp->offset) + break; + if (tmp->offset + tmp->bytes < offset) { + n = rb_next(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + goto out; + } + + if (info->offset == offset) { + ret = 1; + goto out; + } + + if (offset > info->offset && offset < info->offset + info->bytes) + ret = 1; +out: + spin_unlock(&ctl->tree_lock); + return ret; +} + +/* + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. + */ +static int add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + u64 bytes_added; + int ret; + +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + } + + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } + + if (!map) { + map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } + + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); + + if (bytes) + goto again; + + if (map) + kfree(map); + return 0; +} + +#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__) + +/* + * This test just does basic sanity checking, making sure we can add an exten + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group_cache *cache) +{ + int ret = 0; + + test_msg("Running extent only tests\n"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding initial extents %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing extent %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Full remove left some lingering space\n"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding half extent %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing tail end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing front end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + if (ret) { + test_msg("Error removing middle piece %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Still have space at the front\n"); + return -1; + } + + if (check_exists(cache, 2 * 1024 * 1024, 4096)) { + test_msg("Still have space in the middle\n"); + return -1; + } + + if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Still have space at the end\n"); + return -1; + } + + /* Cleanup */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group_cache *cache) +{ + u64 next_bitmap_offset; + int ret; + + test_msg("Running bitmap only tests\n"); + + ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create a bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap full range %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Left some space in bitmap\n"); + return -1; + } + + ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to our bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove middle chunk %d\n", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + + /* Test a bit straddling two bitmaps */ + ret = add_free_space_entry(cache, next_bitmap_offset - + (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space that straddles two bitmaps %d\n", + ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - + (1 * 1024 * 1024), 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), + 2 * 1024 * 1024)) { + test_msg("Left some space when removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + int ret; + + test_msg("Running bitmap and extent tests\n"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create bitmap entry %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove extent entry %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Left remnants after our remove\n"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't re-add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove from bitmap %d\n", ret); + return ret; + } + + if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Left remnants in the bitmap\n"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. + */ + ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to a bitmap %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + test_msg("Left over peices after removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* Now with the extent entry offset into the bitmap */ + ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space to the bitmap %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent to the cache %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + if (ret) { + test_msg("Problem removing overlapping space %d\n", ret); + return ret; + } + + if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + test_msg("Left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, + 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, + 5 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024); + if (ret) { + test_msg("Failed to free our space %d\n", ret); + return ret; + } + + if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024)) { + test_msg("Left stuff over\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap and extent overlapping %d\n", ret); + return ret; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + +void btrfs_test_free_space_cache(void) +{ + struct btrfs_block_group_cache *cache; + + test_msg("Running btrfs free space cache tests\n"); + + cache = init_test_block_group(); + if (!cache) { + test_msg("Couldn't run the tests\n"); + return; + } + + if (test_extents(cache)) + goto out; + if (test_bitmaps(cache)) + goto out; + if (test_bitmaps_and_extents(cache)) + goto out; +out: + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); + test_msg("Free space cache tests finished\n"); +} +#undef test_msg +#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ +void btrfs_test_free_space_cache(void) {} +#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 8f2613f779ed..894116b71304 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, @@ -110,4 +112,7 @@ int btrfs_return_cluster_to_free_space( struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); + +void btrfs_test_free_space_cache(void); + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 48b8fda93132..e0b7034d6343 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -183,10 +183,11 @@ int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, return -ENOENT; } -int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index) +static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, + u64 *index) { struct btrfs_path *path; struct btrfs_key key; @@ -246,7 +247,7 @@ int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(trans, root, path, item_size - del_len, 1); + btrfs_truncate_item(root, path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -309,7 +310,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); + btrfs_truncate_item(root, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -361,7 +362,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name, name_len, NULL)) goto out; - btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(root, path, ins_len); ret = 0; } if (ret < 0) @@ -417,7 +418,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(root, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d26f67a59e36..2c66ddbbe670 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, num_bytes = trans->bytes_reserved; /* * 1 item for inode item insertion if need - * 3 items for inode item update (in the worst case) + * 4 items for inode item update (in the worst case) + * 1 items for slack space if we need do truncation * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); @@ -468,7 +469,8 @@ again: if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); goto out_put; } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09c58a35b429..6d1b93c8aafb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> +#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -41,6 +42,7 @@ #include <linux/mount.h> #include <linux/btrfs.h> #include <linux/blkdev.h> +#include <linux/posix_acl_xattr.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -56,6 +58,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "backref.h" +#include "hash.h" struct btrfs_iget_args { u64 ino; @@ -100,7 +103,10 @@ static noinline int cow_file_range(struct inode *inode, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - u64 orig_block_len, int type); + u64 orig_block_len, u64 ram_bytes, + int type); + +static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -697,8 +703,12 @@ retry: async_extent->nr_pages = 0; async_extent->pages = NULL; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); goto retry; + } goto out_free; } @@ -711,8 +721,10 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_free_reserve; + } em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -722,6 +734,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; + em->ram_bytes = async_extent->ram_size; em->bdev = root->fs_info->fs_devices->latest_bdev; em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -730,10 +743,7 @@ retry: while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -921,7 +931,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, } em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ + if (!em) { + ret = -ENOMEM; + goto out_reserve; + } em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -932,16 +945,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; + em->ram_bytes = ram_size; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -950,11 +961,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } + if (ret) + goto out_reserve; cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_reserve; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { @@ -962,7 +976,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, cur_alloc_size); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto out_unlock; + goto out_reserve; } } @@ -991,6 +1005,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, out: return ret; +out_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -1194,6 +1210,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_bytenr; u64 num_bytes; u64 disk_num_bytes; + u64 ram_bytes; int extent_type; int ret, err; int type; @@ -1290,6 +1307,7 @@ next_slot: struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -1373,6 +1391,7 @@ out_check: em->block_len = num_bytes; em->block_start = disk_bytenr; em->orig_block_len = disk_num_bytes; + em->ram_bytes = ram_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; em->mod_start = em->start; em->mod_len = em->len; @@ -1381,10 +1400,7 @@ out_check: em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -1519,13 +1535,53 @@ static void btrfs_merge_extent_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &root->fs_info->delalloc_roots); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ static void btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { /* @@ -1551,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1569,7 +1617,8 @@ static void btrfs_set_bit_hook(struct inode *inode, * extent_io.c clear_bit_hook, see set_bit_hook for why */ static void btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) + struct extent_state *state, + unsigned long *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -1593,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list) + && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, @@ -1602,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, BTRFS_I(inode)->delalloc_bytes -= len; if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -2252,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return 0; return PTR_ERR(root); } - if (btrfs_root_refs(&root->root_item) == 0) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - /* parse ENOENT to 0 */ - return 0; - } /* step 2: get inode */ key.objectid = backref->inum; @@ -2793,6 +2830,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (PageChecked(page)) { ClearPageChecked(page); @@ -2819,7 +2858,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (ret) goto zeroit; - csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); btrfs_csum_final(csum, (char *)&csum); if (csum != private) goto zeroit; @@ -2829,11 +2868,11 @@ good: return 0; zeroit: - printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " - "private %llu\n", - (unsigned long long)btrfs_ino(page->mapping->host), - (unsigned long long)start, csum, - (unsigned long long)private); + if (__ratelimit(&_rs)) + btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu", + (unsigned long long)btrfs_ino(page->mapping->host), + (unsigned long long)start, csum, + (unsigned long long)private); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -3019,7 +3058,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) * We have done the truncate/delete so we can go ahead and remove the orphan * item for this particular inode. */ -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +static int btrfs_orphan_del(struct btrfs_trans_handle *trans, + struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; int delete_item = 0; @@ -3114,8 +3154,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { - printk(KERN_ERR "btrfs: Error removing orphan entry, " - "stopping orphan cleanup\n"); + btrfs_err(root->fs_info, + "Error removing orphan entry, stopping orphan cleanup"); ret = -EINVAL; goto out; } @@ -3172,8 +3212,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } - printk(KERN_ERR "auto deleting %Lu\n", - found_key.objectid); + btrfs_debug(root->fs_info, "auto deleting %Lu", + found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ @@ -3201,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* 1 for the orphan item deletion. */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { + iput(inode); ret = PTR_ERR(trans); goto out; } ret = btrfs_orphan_add(trans, inode); btrfs_end_transaction(trans, root); - if (ret) + if (ret) { + iput(inode); goto out; + } ret = btrfs_truncate(inode); if (ret) @@ -3237,13 +3280,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) } if (nr_unlink) - printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); if (nr_truncate) - printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); out: if (ret) - printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_crit(root->fs_info, + "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; } @@ -3259,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; int scanned = 0; + if (!xattr_access) { + xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)); + } + slot++; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3270,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 0; /* we found an xattr, assume we've got an acl */ - if (found_key.type == BTRFS_XATTR_ITEM_KEY) - return 1; + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } /* * we found a key greater than an xattr key, there can't @@ -3591,9 +3647,10 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { - printk(KERN_INFO "btrfs failed to delete reference to %.*s, " - "inode %llu parent %llu\n", name_len, name, - (unsigned long long)ino, (unsigned long long)dir_ino); + btrfs_info(root->fs_info, + "failed to delete reference to %.*s, inode %llu parent %llu", + name_len, name, + (unsigned long long)ino, (unsigned long long)dir_ino); btrfs_abort_transaction(trans, root, ret); goto err; } @@ -3615,6 +3672,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, dir, index); if (ret == -ENOENT) ret = 0; + else if (ret) + btrfs_abort_transaction(trans, root, ret); err: btrfs_free_path(path); if (ret) @@ -3642,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, } return ret; } - - -/* helper to check if there is any shared block in the path */ -static int check_path_shared(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct extent_buffer *eb; - int level; - u64 refs = 1; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - int ret; - - if (!path->nodes[level]) - break; - eb = path->nodes[level]; - if (!btrfs_block_can_be_shared(root, eb)) - continue; - ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, - &refs, NULL); - if (refs > 1) - return 1; - } - return 0; -} /* * helper to start transaction for unlink and rmdir. * - * unlink and rmdir are special in btrfs, they do not always free space. - * so in enospc case, we should make sure they will free space before - * allowing them to use the global metadata reservation. + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. */ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, - struct dentry *dentry) +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_dir_item *di; - struct inode *inode = dentry->d_inode; - u64 index; - int check_link = 1; - int err = -ENOSPC; int ret; - u64 ino = btrfs_ino(inode); - u64 dir_ino = btrfs_ino(dir); /* * 1 for the possible orphan item @@ -3701,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; - if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return ERR_PTR(-ENOSPC); - - /* check if there is someone else holds reference */ - if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) - return ERR_PTR(-ENOSPC); - - if (atomic_read(&inode->i_count) > 2) - return ERR_PTR(-ENOSPC); - - if (xchg(&root->fs_info->enospc_unlink, 1)) - return ERR_PTR(-ENOSPC); - - path = btrfs_alloc_path(); - if (!path) { - root->fs_info->enospc_unlink = 0; - return ERR_PTR(-ENOMEM); - } - - /* 1 for the orphan item */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_free_path(path); - root->fs_info->enospc_unlink = 0; - return trans; - } - - path->skip_locking = 1; - path->search_commit_root = 1; - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(dir)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); + if (PTR_ERR(trans) == -ENOSPC) { + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - if (ret == 0 && S_ISREG(inode->i_mode)) { - ret = btrfs_lookup_file_extent(trans, root, path, - ino, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, 5); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); } - BUG_ON(ret == 0); /* Corruption */ - if (check_path_shared(root, path)) - goto out; - btrfs_release_path(path); - } - - if (!check_link) { - err = 0; - goto out; - } - - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - if (di) { - if (check_path_shared(root, path)) - goto out; - } else { - err = 0; - goto out; - } - btrfs_release_path(path); - - ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, - dentry->d_name.len, ino, dir_ino, 0, - &index); - if (ret) { - err = ret; - goto out; - } - - if (check_path_shared(root, path)) - goto out; - - btrfs_release_path(path); - - /* - * This is a commit root search, if we can lookup inode item and other - * relative items in the commit root, it means the transaction of - * dir/file creation has been committed, and the dir index item that we - * delay to insert has also been inserted into the commit root. So - * we needn't worry about the delayed insertion of the dir index item - * here. - */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - BUG_ON(ret == -ENOENT); - if (check_path_shared(root, path)) - goto out; - - err = 0; -out: - btrfs_free_path(path); - /* Migrate the orphan reservation over */ - if (!err) - err = btrfs_block_rsv_migrate(trans->block_rsv, - &root->fs_info->global_block_rsv, - trans->bytes_reserved); - - if (err) { - btrfs_end_transaction(trans, root); - root->fs_info->enospc_unlink = 0; - return ERR_PTR(err); - } - - trans->block_rsv = &root->fs_info->global_block_rsv; - return trans; -} - -static void __unlink_end_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); trans->block_rsv = &root->fs_info->trans_block_rsv; - BUG_ON(!root->fs_info->enospc_unlink); - root->fs_info->enospc_unlink = 0; + trans->bytes_reserved = num_bytes; } - btrfs_end_transaction(trans, root); + return trans; } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3862,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3880,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return ret; } @@ -3977,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) return -EPERM; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3999,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return err; @@ -4175,8 +4066,7 @@ search_again: } size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(trans, root, path, - size, 1); + btrfs_truncate_item(root, path, size, 1); } else if (root->ref_cows) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); @@ -4378,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) u64 hole_size; int err = 0; + /* + * If our size started in the middle of a page we need to zero out the + * rest of the page before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_page(inode, oldsize, 0, 0); + if (err) + return err; + if (size <= hole_start) return 0; @@ -4450,16 +4349,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; + hole_em->ram_bytes = hole_size; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; while (1) { write_lock(&em_tree->lock); - err = add_extent_mapping(em_tree, hole_em); - if (!err) - list_move(&hole_em->list, - &em_tree->modified_extents); + err = add_extent_mapping(em_tree, hole_em, 1); write_unlock(&em_tree->lock); if (err != -EEXIST) break; @@ -4670,8 +4567,9 @@ void btrfs_evict_inode(struct inode *inode) ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); if (ret) { - printk(KERN_WARNING "Could not get space for a " - "delete, will truncate on mount %d\n", ret); + btrfs_warn(root->fs_info, + "Could not get space for a delete, will truncate on mount %d", + ret); btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); goto no_delete; @@ -4712,6 +4610,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); no_delete: + btrfs_remove_delayed_node(inode); clear_inode(inode); return; } @@ -4805,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - err = -ENOENT; - goto out; - } - *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; @@ -4827,14 +4721,13 @@ static void inode_tree_add(struct inode *inode) struct rb_node **p; struct rb_node *parent; u64 ino = btrfs_ino(inode); -again: - p = &root->inode_tree.rb_node; - parent = NULL; if (inode_unhashed(inode)) return; - +again: + parent = NULL; spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); @@ -5076,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!(inode->i_sb->s_flags & MS_RDONLY)) ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); - if (ret) + if (ret) { + iput(inode); inode = ERR_PTR(ret); + } } return inode; @@ -5121,10 +5016,9 @@ unsigned char btrfs_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int btrfs_real_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_item *item; struct btrfs_dir_item *di; @@ -5145,29 +5039,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, char tmp_name[32]; char *name_ptr; int name_len; - int is_curr = 0; /* filp->f_pos points to the current index? */ + int is_curr = 0; /* ctx->pos points to the current index? */ /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) key_type = BTRFS_DIR_ITEM_KEY; - /* special case for "." */ - if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, - filp->f_pos, btrfs_ino(inode), DT_DIR); - if (over) - return 0; - filp->f_pos = 1; - } - /* special case for .., just use the back ref */ - if (filp->f_pos == 1) { - u64 pino = parent_ino(filp->f_path.dentry); - over = filldir(dirent, "..", 2, - filp->f_pos, pino, DT_DIR); - if (over) - return 0; - filp->f_pos = 2; - } + if (!dir_emit_dots(file, ctx)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5181,7 +5061,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, } btrfs_set_key_type(&key, key_type); - key.offset = filp->f_pos; + key.offset = ctx->pos; key.objectid = btrfs_ino(inode); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -5207,14 +5087,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, break; if (btrfs_key_type(&found_key) != key_type) break; - if (found_key.offset < filp->f_pos) + if (found_key.offset < ctx->pos) goto next; if (key_type == BTRFS_DIR_INDEX_KEY && btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; - filp->f_pos = found_key.offset; + ctx->pos = found_key.offset; is_curr = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); @@ -5258,9 +5138,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, over = 0; goto skip; } - over = filldir(dirent, name_ptr, name_len, - found_key.offset, location.objectid, - d_type); + over = !dir_emit(ctx, name_ptr, name_len, + location.objectid, d_type); skip: if (name_ptr != tmp_name) @@ -5279,9 +5158,8 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) - filp->f_pos++; - ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, - &ins_list); + ctx->pos++; + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; } @@ -5292,9 +5170,9 @@ next: * 32-bit glibc will use getdents64, but then strtol - * so the last number we can serve is this. */ - filp->f_pos = 0x7fffffff; + ctx->pos = 0x7fffffff; else - filp->f_pos++; + ctx->pos++; nopos: ret = 0; err: @@ -5335,7 +5213,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -5977,7 +5855,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, em->block_start += start_diff; em->block_len -= start_diff; } - return add_extent_mapping(em_tree, em); + return add_extent_mapping(em_tree, em, 0); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6151,6 +6029,7 @@ again: goto not_found_em; } + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; @@ -6259,18 +6138,18 @@ not_found_em: insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { - printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " - "[%llu %llu]\n", (unsigned long long)em->start, - (unsigned long long)em->len, - (unsigned long long)start, - (unsigned long long)len); + btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", + (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); err = -EIO; goto out; } err = 0; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -6482,7 +6361,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, } em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, 0); + ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) goto out; @@ -6501,8 +6380,10 @@ out: * returns 1 when the nocow is safe, < 1 on error, 0 if the * block must be cow'd */ -static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 len) +noinline int can_nocow_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) { struct btrfs_path *path; int ret; @@ -6516,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, u64 num_bytes; int slot; int found_type; - + bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6556,15 +6437,29 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, /* not a regular extent, must cow */ goto out; } + + if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - backref_offset = btrfs_file_extent_offset(leaf, fi); + if (disk_bytenr == 0) + goto out; - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end < offset + len) { - /* extent doesn't include our full range, must cow */ + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) goto out; + + backref_offset = btrfs_file_extent_offset(leaf, fi); + + if (orig_start) { + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (btrfs_extent_readonly(root, disk_bytenr)) goto out; @@ -6584,13 +6479,14 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - num_bytes = min(offset + len, extent_end) - offset; + num_bytes = min(offset + *len, extent_end) - offset; if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out; /* * all of the above have passed, it is safe to overwrite this extent * without cow */ + *len = num_bytes; ret = 1; out: btrfs_free_path(path); @@ -6661,7 +6557,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - u64 orig_block_len, int type) + u64 orig_block_len, u64 ram_bytes, + int type) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -6682,6 +6579,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; + em->ram_bytes = ram_bytes; em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) @@ -6691,10 +6589,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, btrfs_drop_extent_cache(inode, em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); } while (ret == -EEXIST); @@ -6789,7 +6684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, em->block_start != EXTENT_MAP_HOLE)) { int type; int ret; - u64 block_start; + u64 block_start, orig_start, orig_block_len, ram_bytes; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; @@ -6807,16 +6702,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (IS_ERR(trans)) goto must_cow; - if (can_nocow_odirect(trans, inode, start, len) == 1) { - u64 orig_start = em->orig_start; - u64 orig_block_len = em->orig_block_len; - + if (can_nocow_extent(trans, inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, orig_start, block_start, len, - orig_block_len, type); + orig_block_len, + ram_bytes, type); if (IS_ERR(em)) { btrfs_end_transaction(trans, root); goto unlock_err; @@ -6910,7 +6804,11 @@ struct btrfs_dio_private { /* IO errors */ int errors; + /* orig_bio is our btrfs_io_bio */ struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) @@ -6920,6 +6818,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct bio_vec *bvec = bio->bi_io_vec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *dio_bio; u64 start; start = dip->logical_offset; @@ -6936,7 +6835,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) goto failed; local_irq_save(flags); kaddr = kmap_atomic(page); - csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum = btrfs_csum_data(kaddr + bvec->bv_offset, csum, bvec->bv_len); btrfs_csum_final(csum, (char *)&csum); kunmap_atomic(kaddr); @@ -6945,11 +6844,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) flush_dcache_page(bvec->bv_page); if (csum != private) { failed: - printk(KERN_ERR "btrfs csum failed ino %llu off" - " %llu csum %u private %u\n", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)start, - csum, (unsigned)private); + btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u", + (unsigned long long)btrfs_ino(inode), + (unsigned long long)start, + csum, (unsigned)private); err = -EIO; } } @@ -6960,14 +6858,15 @@ failed: unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had a csum failure make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static void btrfs_endio_direct_write(struct bio *bio, int err) @@ -6978,6 +6877,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_ordered_extent *ordered = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; + struct bio *dio_bio; int ret; if (err) @@ -7005,14 +6905,15 @@ out_test: goto again; } out_done: - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had an error make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, @@ -7048,10 +6949,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (!atomic_dec_and_test(&dip->pending_bios)) goto out; - if (dip->errors) + if (dip->errors) { bio_io_error(dip->orig_bio); - else { - set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + } else { + set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); bio_endio(dip->orig_bio, 0); } out: @@ -7226,48 +7127,54 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, - loff_t file_offset) +static void btrfs_submit_direct(int rw, struct bio *dio_bio, + struct inode *inode, loff_t file_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio *io_bio; int skip_sum; int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + + if (!io_bio) { + ret = -ENOMEM; + goto free_ordered; + } + dip = kmalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_ordered; + goto free_io_bio; } - dip->private = bio->bi_private; + dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - - dip->bytes = 0; - do { - dip->bytes += bvec->bv_len; - bvec++; - } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); - - dip->disk_bytenr = (u64)bio->bi_sector << 9; - bio->bi_private = dip; + dip->bytes = dio_bio->bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + io_bio->bi_private = dip; dip->errors = 0; - dip->orig_bio = bio; + dip->orig_bio = io_bio; + dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); if (write) - bio->bi_end_io = btrfs_endio_direct_write; + io_bio->bi_end_io = btrfs_endio_direct_write; else - bio->bi_end_io = btrfs_endio_direct_read; + io_bio->bi_end_io = btrfs_endio_direct_read; ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; + +free_io_bio: + bio_put(io_bio); + free_ordered: /* * If this is a write, we need to clean up the reserved space and kill @@ -7283,7 +7190,7 @@ free_ordered: btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } - bio_endio(bio, ret); + bio_endio(dio_bio, ret); } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, @@ -7347,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, atomic_inc(&inode->i_dio_count); smp_mb__after_atomic_inc(); + /* + * The generic stuff only does filemap_write_and_wait_range, which isn't + * enough if we've written compressed pages to this area, so we need to + * call btrfs_wait_ordered_range to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + count = iov_length(iov, nr_segs); + btrfs_wait_ordered_range(inode, offset, count); + if (rw & WRITE) { - count = iov_length(iov, nr_segs); /* * If the write DIO is beyond the EOF, we need update * the isize, but it is protected by i_mutex. So we can @@ -7425,8 +7340,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) return extent_write_full_page(tree, page, btrfs_get_extent, wbc); } -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -7467,7 +7382,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } -static void btrfs_invalidatepage(struct page *page, unsigned long offset) +static void btrfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct extent_io_tree *tree; @@ -7667,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - int ret; + int ret = 0; int err = 0; struct btrfs_trans_handle *trans; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); - if (ret) - return ret; - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); @@ -7934,15 +7846,15 @@ void btrfs_destroy_inode(struct inode *inode) */ smp_mb(); if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { - printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", - (unsigned long long)btrfs_ino(inode)); + btrfs_info(root->fs_info, "inode %llu still on the orphan list", + (unsigned long long)btrfs_ino(inode)); atomic_dec(&root->orphan_inodes); } @@ -7951,10 +7863,9 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - printk(KERN_ERR "btrfs found ordered " - "extent %llu %llu on inode cleanup\n", - (unsigned long long)ordered->file_offset, - (unsigned long long)ordered->len); + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -7963,7 +7874,6 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - btrfs_remove_delayed_node(inode); call_rcu(&inode->i_rcu, btrfs_i_callback); } @@ -7971,6 +7881,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (root == NULL) + return 1; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && root != root->fs_info->tree_root) @@ -8305,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct btrfs_inode *binode; struct inode *inode; @@ -8314,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head splice; int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_del_init(&binode->delalloc_inodes); - + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); inode = igrab(&binode->vfs_inode); if (!inode) { - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &binode->runtime_flags); + cond_resched_lock(&root->delalloc_lock); continue; } - - list_add_tail(&binode->delalloc_inodes, - &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); if (unlikely(!work)) { @@ -8349,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) &work->work); cond_resched(); - spin_lock(&root->fs_info->delalloc_lock); + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + spin_unlock(&root->delalloc_lock); + } + return ret; +} + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + int ret; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; - /* the filemap_flush will queue IO into the worker threads, but + ret = __start_delalloc_inodes(root, delay_iput); + /* + * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return */ @@ -8370,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); + return ret; +} + +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput) +{ + struct btrfs_root *root; + struct list_head splice; + int ret; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = __start_delalloc_inodes(root, delay_iput); + btrfs_put_fs_root(root); + if (ret) + goto out; + + spin_lock(&fs_info->delalloc_root_lock); } + spin_unlock(&fs_info->delalloc_root_lock); + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&fs_info->async_submit_draining); + return 0; +out: if (!list_empty_careful(&splice)) { - spin_lock(&root->fs_info->delalloc_lock); - list_splice_tail(&splice, &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } return ret; } @@ -8571,16 +8538,14 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; + em->ram_bytes = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) break; @@ -8689,7 +8654,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = btrfs_real_readdir, + .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2c02310ff2d9..238a05545ee2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root->ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root, 0); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; @@ -723,7 +729,9 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -1152,8 +1160,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; - if (extent_thresh == 0) - extent_thresh = 256 * 1024; + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { if (range->compress_type > BTRFS_COMPRESS_TYPES) @@ -1162,8 +1173,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (isize == 0) - return 0; + if (extent_thresh == 0) + extent_thresh = 256 * 1024; /* * if we were not given a file, allocate a readahead @@ -1796,7 +1807,11 @@ static noinline int copy_to_sk(struct btrfs_root *root, item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size_nr(leaf, i); - if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) item_len = 0; if (sizeof(sh) + item_len + *sk_offset > @@ -1805,10 +1820,6 @@ static noinline int copy_to_sk(struct btrfs_root *root, goto overflow; } - btrfs_item_key_to_cpu(leaf, key, i); - if (!key_in_sk(key, sk)) - continue; - sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; @@ -2086,7 +2097,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (err == -EINTR) + goto out; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2347,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - - mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2362,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(root, vol_args->name); - kfree(vol_args); -out: + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + +out: + kfree(vol_args); mnt_drop_write_file(file); return ret; } @@ -2425,7 +2438,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) mutex_lock(&fs_devices->device_list_mutex); dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); - mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { ret = -ENODEV; @@ -2449,6 +2461,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) } out: + mutex_unlock(&fs_devices->device_list_mutex); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; @@ -2473,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; + int same_inode = 0; /* * TODO: @@ -2509,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = -EINVAL; if (src == inode) - goto out_fput; + same_inode = 1; /* the src must be open for reading */ if (!(src_file.file->f_mode & FMODE_READ)) @@ -2540,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } path->reada = 2; - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + if (!same_inode) { + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + mutex_lock(&src->i_mutex); } /* determine range to clone */ @@ -2563,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, !IS_ALIGNED(destoff, bs)) goto out_unlock; + /* verify if ranges are overlapped within the same file */ + if (same_inode) { + if (destoff + len > off && destoff < off + len) + goto out_unlock; + } + if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) @@ -2839,7 +2863,8 @@ out: unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); + if (!same_inode) + mutex_unlock(&inode->i_mutex); vfree(buf); btrfs_free_path(path); out_fput: @@ -2944,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - ret = -ENOENT; - goto out; - } - path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3003,7 +3023,7 @@ void btrfs_get_block_group_info(struct list_head *groups_list, } } -long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_space_args space_args; struct btrfs_ioctl_space_info space; @@ -3693,12 +3713,11 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + down_write(&root->fs_info->subvol_sem); + trans = btrfs_start_transaction(root->fs_info->tree_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; } switch (sa->cmd) { @@ -3708,24 +3727,17 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(trans, root->fs_info); break; - case BTRFS_QUOTA_CTL_RESCAN: - ret = btrfs_quota_rescan(root->fs_info); - break; default: ret = -EINVAL; break; } - if (copy_to_user(arg, sa, sizeof(*sa))) - ret = -EFAULT; - - if (trans) { - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - } + err = btrfs_commit_transaction(trans, root->fs_info->tree_root); + if (err && !ret) + ret = err; out: kfree(sa); + up_write(&root->fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; @@ -3877,6 +3889,74 @@ drop_write: return ret; } +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + qsa = memdup_user(arg, sizeof(*qsa)); + if (IS_ERR(qsa)) { + ret = PTR_ERR(qsa); + goto drop_write; + } + + if (qsa->flags) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_qgroup_rescan(root->fs_info); + +out: + kfree(qsa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + if (!qsa) + return -ENOMEM; + + if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + qsa->flags = 1; + qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; + } + + if (copy_to_user(arg, qsa, sizeof(*qsa))) + ret = -EFAULT; + + kfree(qsa); + return ret; +} + +static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(root->fs_info); +} + static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { @@ -3960,7 +4040,7 @@ out: static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; const char *label = root->fs_info->super_copy->label; size_t len = strnlen(label, BTRFS_LABEL_SIZE); int ret; @@ -3979,7 +4059,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_super_block *super_block = root->fs_info->super_copy; struct btrfs_trans_handle *trans; char label[BTRFS_LABEL_SIZE]; @@ -4115,6 +4195,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(file, argp); + case BTRFS_IOC_QUOTA_RESCAN: + return btrfs_ioctl_quota_rescan(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_STATUS: + return btrfs_ioctl_quota_rescan_status(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); case BTRFS_IOC_GET_FSLABEL: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index e95df435d897..01277b8f2373 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,7 +24,7 @@ #include "extent_io.h" #include "locking.h" -void btrfs_assert_tree_read_locked(struct extent_buffer *eb); +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); /* * if we currently have a spinning reader or writer lock @@ -264,7 +264,7 @@ void btrfs_assert_tree_locked(struct extent_buffer *eb) BUG_ON(!atomic_read(&eb->write_locks)); } -void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { BUG_ON(!atomic_read(&eb->read_locks)); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 743b86fa4fcb..f93151a98886 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -31,8 +31,8 @@ struct workspace { void *mem; - void *buf; /* where compressed data goes */ - void *cbuf; /* where decompressed data goes */ + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ struct list_head list; }; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 005c45db699e..81369827e514 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "btrfs_inode.h" #include "extent_io.h" +#include "disk-io.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int dio, int compress_type) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; @@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, ordered_data_tree_panic(inode, -EEXIST, file_offset); spin_unlock_irq(&tree->lock); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, - &BTRFS_I(inode)->root->fs_info->ordered_extents); - spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, + &root->fs_info->ordered_roots); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); return 0; } @@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode, set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); + root->nr_ordered_extents--; trace_btrfs_ordered_extent_remove(inode, entry); @@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { list_del_init(&BTRFS_I(inode)->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + + if (!root->nr_ordered_extents) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(list_empty(&root->ordered_root)); + list_del_init(&root->ordered_root); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); } @@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { struct list_head splice, works; - struct list_head *cur; struct btrfs_ordered_extent *ordered, *next; struct inode *inode; @@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_extents, &splice); + spin_lock(&root->ordered_extent_lock); + list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice)) { - cur = splice.next; - ordered = list_entry(cur, struct btrfs_ordered_extent, - root_extent_list); - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - + ordered = list_first_entry(&splice, struct btrfs_ordered_extent, + root_extent_list); + list_move_tail(&ordered->root_extent_list, + &root->ordered_extents); /* * the inode may be getting freed (in sys_unlink path). */ inode = igrab(ordered->inode); + if (!inode) { + cond_resched_lock(&root->ordered_extent_lock); + continue; + } - spin_unlock(&root->fs_info->ordered_extent_lock); + atomic_inc(&ordered->refs); + spin_unlock(&root->ordered_extent_lock); - if (inode) { - ordered->flush_work.func = btrfs_run_ordered_extent_work; - list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &ordered->flush_work); - } else { - btrfs_put_ordered_extent(ordered); - } + ordered->flush_work.func = btrfs_run_ordered_extent_work; + list_add_tail(&ordered->work_list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->ordered_extent_lock); list_for_each_entry_safe(ordered, next, &works, work_list) { list_del_init(&ordered->work_list); @@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) mutex_unlock(&root->fs_info->ordered_operations_mutex); } +void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info, + int delay_iput) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + + btrfs_wait_ordered_extents(root, delay_iput); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); +} + /* * this is used during transaction commit to write all the inodes * added to the ordered operation list. These files must be fully on @@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, @@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, if (!wait) list_add_tail(&BTRFS_I(inode)->ordered_operations, &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); work = btrfs_alloc_delalloc_work(inode, wait, 1); if (!work) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) list_add_tail(&btrfs_inode->ordered_operations, &splice); list_splice_tail(&splice, &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); ret = -ENOMEM; goto out; } @@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, &work->work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); @@ -986,39 +1029,42 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum) + u32 *sum, int len) { struct btrfs_ordered_sum *ordered_sum; - struct btrfs_sector_sum *sector_sums; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = BTRFS_I(inode)->root->sectorsize; - int ret = 1; + int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) - return 1; + return 0; spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { - if (disk_bytenr >= ordered_sum->bytenr) { - num_sectors = ordered_sum->len / sectorsize; - sector_sums = ordered_sum->sums; - for (i = 0; i < num_sectors; i++) { - if (sector_sums[i].bytenr == disk_bytenr) { - *sum = sector_sums[i].sum; - ret = 0; - goto out; - } - } + if (disk_bytenr >= ordered_sum->bytenr && + disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { + i = (disk_bytenr - ordered_sum->bytenr) >> + inode->i_sb->s_blocksize_bits; + num_sectors = ordered_sum->len >> + inode->i_sb->s_blocksize_bits; + num_sectors = min_t(int, len - index, num_sectors - i); + memcpy(sum + index, ordered_sum->sums + i, + num_sectors); + + index += (int)num_sectors; + if (index == len) + goto out; + disk_bytenr += num_sectors * sectorsize; } } out: spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); - return ret; + return index; } @@ -1049,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, &cur_trans->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8eadfe406cdd..68844d59ee6f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree { struct rb_node *last; }; -/* - * these are used to collect checksums done just before bios submission. - * They are attached via a list into the ordered extent, and - * checksum items are inserted into the tree after all the blocks in - * the ordered extent are on disk - */ -struct btrfs_sector_sum { - /* bytenr on disk */ - u64 bytenr; - u32 sum; -}; - struct btrfs_ordered_sum { /* bytenr is the start of this extent on disk */ u64 bytenr; @@ -45,10 +33,10 @@ struct btrfs_ordered_sum { /* * this is the length in bytes covered by the sums array below. */ - unsigned long len; + int len; struct list_head list; - /* last field is a variable length array of btrfs_sector_sums */ - struct btrfs_sector_sum sums[]; + /* last field is a variable length array of csums */ + u32 sums[]; }; /* @@ -149,11 +137,8 @@ struct btrfs_ordered_extent { static inline int btrfs_ordered_sum_size(struct btrfs_root *root, unsigned long bytes) { - unsigned long num_sectors = (bytes + root->sectorsize - 1) / - root->sectorsize; - num_sectors++; - return sizeof(struct btrfs_ordered_sum) + - num_sectors * sizeof(struct btrfs_sector_sum); + int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize); + return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); } static inline void @@ -196,13 +181,16 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum, int len); int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info, + int delay_iput); void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 920957ecb27e..dc0024f17c1f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -176,7 +176,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) nr = btrfs_header_nritems(l); - printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { @@ -319,10 +319,9 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_print_leaf(root, c); return; } - printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", - (unsigned long long)btrfs_header_bytenr(c), - level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u", + (unsigned long long)btrfs_header_bytenr(c), + level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index da75efe534d5..7faddfacc5bd 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -19,5 +19,5 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b44124dd2370..1280eff8af56 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -31,13 +31,13 @@ #include "locking.h" #include "ulist.h" #include "backref.h" +#include "extent_io.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys * - compressed * - sync - * - rescan * - copy also limits on subvol creation * - limit * - caches fuer ulists @@ -98,7 +98,12 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -/* must be called with qgroup_lock held */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags); +static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); + +/* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) { @@ -247,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) int slot; int ret = 0; u64 flags = 0; + u64 rescan_progress = 0; if (!fs_info->quota_enabled) return 0; + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -298,7 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - /* FIXME read scan element */ + rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -400,9 +412,18 @@ out: if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) { + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } btrfs_free_path(path); + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + return ret < 0 ? ret : 0; } @@ -420,8 +441,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - WARN_ON(!list_empty(&qgroup->dirty)); - while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, @@ -441,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } kfree(qgroup); } + ulist_free(fs_info->qgroup_ulist); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, @@ -721,7 +741,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); - /* XXX scan */ + btrfs_set_qgroup_status_rescan(l, ptr, + fs_info->qgroup_rescan_progress.objectid); btrfs_mark_buffer_dirty(l); @@ -783,19 +804,27 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root; + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_path *path = NULL; struct btrfs_qgroup_status_item *ptr; struct extent_buffer *leaf; struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_qgroup *qgroup = NULL; int ret = 0; + int slot; - spin_lock(&fs_info->qgroup_lock); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) { fs_info->pending_quota_state = 1; - spin_unlock(&fs_info->qgroup_lock); goto out; } - spin_unlock(&fs_info->qgroup_lock); + + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } /* * initially create the quota tree @@ -830,10 +859,57 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); - btrfs_set_qgroup_status_scan(leaf, ptr, 0); + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(leaf); + key.objectid = 0; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = 0; + + btrfs_release_path(path); + ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); + if (ret > 0) + goto out_add_root; + if (ret < 0) + goto out_free_path; + + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.type == BTRFS_ROOT_REF_KEY) { + ret = add_qgroup_item(trans, quota_root, + found_key.offset); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } + } + ret = btrfs_next_item(tree_root, path); + if (ret < 0) + goto out_free_path; + if (ret) + break; + } + +out_add_root: + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; fs_info->pending_quota_state = 1; @@ -847,6 +923,11 @@ out_free_root: kfree(quota_root); } out: + if (ret) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -857,11 +938,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + goto out; spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) { - spin_unlock(&fs_info->qgroup_lock); - return 0; - } fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; @@ -869,8 +949,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, btrfs_free_qgroup_config(fs_info); spin_unlock(&fs_info->qgroup_lock); - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) @@ -891,39 +973,62 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, free_extent_buffer(quota_root->commit_root); kfree(quota_root); out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info) +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) { - /* FIXME */ - return 0; + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + ret = -EEXIST; + goto out; + } + } ret = add_qgroup_relation_item(trans, quota_root, src, dst); if (ret) - return ret; + goto out; ret = add_qgroup_relation_item(trans, quota_root, dst, src); if (ret) { del_qgroup_relation_item(trans, quota_root, src, dst); - return ret; + goto out; } spin_lock(&fs_info->qgroup_lock); ret = add_relation_rb(quota_root->fs_info, src, dst); spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -931,13 +1036,34 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; int ret = 0; int err; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) + goto exist; + } + ret = -ENOENT; + goto out; +exist: ret = del_qgroup_relation_item(trans, quota_root, src, dst); err = del_qgroup_relation_item(trans, quota_root, dst, src); if (err && !ret) @@ -945,9 +1071,9 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); - spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -958,11 +1084,21 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + ret = -EEXIST; + goto out; + } ret = add_qgroup_item(trans, quota_root, qgroupid); + if (ret) + goto out; spin_lock(&fs_info->qgroup_lock); qgroup = add_qgroup_rb(fs_info, qgroupid); @@ -970,7 +1106,8 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, if (IS_ERR(qgroup)) ret = PTR_ERR(qgroup); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -981,27 +1118,32 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } - /* check if there are no relations to this qgroup */ - spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, qgroupid); - if (qgroup) { - if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { - spin_unlock(&fs_info->qgroup_lock); - return -EBUSY; + if (!qgroup) { + ret = -ENOENT; + goto out; + } else { + /* check if there are no relations to this qgroup */ + if (!list_empty(&qgroup->groups) || + !list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; } } - spin_unlock(&fs_info->qgroup_lock); - ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -1009,13 +1151,22 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid, struct btrfs_qgroup_limit *limit) { - struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; - if (!quota_root) - return -EINVAL; + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } ret = update_qgroup_limit_item(trans, quota_root, qgroupid, limit->flags, limit->max_rfer, limit->max_excl, limit->rsv_rfer, @@ -1027,31 +1178,17 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, } spin_lock(&fs_info->qgroup_lock); - - qgroup = find_qgroup_rb(fs_info, qgroupid); - if (!qgroup) { - ret = -ENOENT; - goto unlock; - } qgroup->lim_flags = limit->flags; qgroup->max_rfer = limit->max_rfer; qgroup->max_excl = limit->max_excl; qgroup->rsv_rfer = limit->rsv_rfer; qgroup->rsv_excl = limit->rsv_excl; - -unlock: spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -static void qgroup_dirty(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) -{ - if (list_empty(&qgroup->dirty)) - list_add(&qgroup->dirty, &fs_info->dirty_qgroups); -} - /* * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts * the modification into a list that's later used by btrfs_end_transaction to @@ -1075,6 +1212,144 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, return 0; } +static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + u64 seq) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + int ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + /* XXX id not needed */ + ret = ulist_add(tmp, qg->qgroupid, + (u64)(uintptr_t)qg, GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; + if (qg->refcnt < seq) + qg->refcnt = seq + 1; + else + ++qg->refcnt; + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (u64)(uintptr_t)glist->group, + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + + return 0; +} + +static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + u64 seq, int sgn, u64 num_bytes, + struct btrfs_qgroup *qgroup) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + int ret; + + ulist_reinit(tmp); + ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + return ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + if (qg->refcnt < seq) { + /* not visited by step 1 */ + qg->rfer += sgn * num_bytes; + qg->rfer_cmpr += sgn * num_bytes; + if (roots->nnodes == 0) { + qg->excl += sgn * num_bytes; + qg->excl_cmpr += sgn * num_bytes; + } + qgroup_dirty(fs_info, qg); + } + WARN_ON(qg->tag >= seq); + qg->tag = seq; + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + + return 0; +} + +static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + u64 seq, int sgn, u64 num_bytes) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + int ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); + if (ret < 0) + return ret; + + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; + if (qg->tag == seq) + continue; + + if (qg->refcnt - seq == roots->nnodes) { + qg->excl -= sgn * num_bytes; + qg->excl_cmpr -= sgn * num_bytes; + qgroup_dirty(fs_info, qg); + } + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (uintptr_t)glist->group, + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + + return 0; +} + /* * btrfs_qgroup_account_ref is called for every ref that is added to or deleted * from the fs. First, all roots referencing the extent are searched, and @@ -1090,10 +1365,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; u64 ref_root; struct btrfs_qgroup *qgroup; - struct ulist_node *unode; struct ulist *roots = NULL; - struct ulist *tmp = NULL; - struct ulist_iterator uiter; u64 seq; int ret = 0; int sgn; @@ -1132,9 +1404,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_EXTENT: sgn = 1; + seq = btrfs_tree_mod_seq_prev(node->seq); break; case BTRFS_DROP_DELAYED_REF: sgn = -1; + seq = node->seq; break; case BTRFS_UPDATE_DELAYED_HEAD: return 0; @@ -1142,20 +1416,30 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, BUG(); } + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + /* * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass (node->seq - 1) to skip + * the operation. for add operations, we pass + * tree_mod_log_prev_seq(node->seq) to skip * the delayed ref's current sequence number, because we need the state * of the tree before the add operation. for delete operations, we pass * (node->seq) to include the delayed ref's current sequence number, * because we need the state of the tree after the delete operation. */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, - sgn > 0 ? node->seq - 1 : node->seq, &roots); + ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); if (ret < 0) return ret; spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; if (!quota_root) goto unlock; @@ -1167,116 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, /* * step 1: for each old ref, visit all nodes once and inc refcnt */ - tmp = ulist_alloc(GFP_ATOMIC); - if (!tmp) { - ret = -ENOMEM; - goto unlock; - } + ulist_reinit(fs_info->qgroup_ulist); seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ulist_reinit(tmp); - /* XXX id not needed */ - ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC); - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; - else - ++qg->refcnt; - - list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, - GFP_ATOMIC); - } - } - } + ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, + seq); + if (ret) + goto unlock; /* * step 2: walk from the new root */ - ulist_reinit(tmp); - ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * node->num_bytes; - qg->rfer_cmpr += sgn * node->num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * node->num_bytes; - qg->excl_cmpr += sgn * node->num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; - - list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); - } - } + ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, + seq, sgn, node->num_bytes, qgroup); + if (ret) + goto unlock; /* * step 3: walk again from old refs */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ulist_reinit(tmp); - ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; - - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * node->num_bytes; - qg->excl_cmpr -= sgn * node->num_bytes; - qgroup_dirty(fs_info, qg); - } + ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, + seq, sgn, node->num_bytes); + if (ret) + goto unlock; - list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - } - } - } - ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); - ulist_free(tmp); return ret; } @@ -1289,10 +1491,14 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; + int start_rescan_worker = 0; if (!quota_root) goto out; + if (!fs_info->quota_enabled && fs_info->pending_quota_state) + start_rescan_worker = 1; + fs_info->quota_enabled = fs_info->pending_quota_state; spin_lock(&fs_info->qgroup_lock); @@ -1318,6 +1524,16 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + if (!ret && start_rescan_worker) { + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } + ret = 0; + } + out: return ret; @@ -1338,12 +1554,30 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; u32 level_size = 0; + u64 nums; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_enabled) - return 0; + goto out; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + for (i = 0; i < nums; ++i) { + srcgroup = find_qgroup_rb(fs_info, *i_qgroups); + if (!srcgroup) { + ret = -EINVAL; + goto out; + } + ++i_qgroups; + } + } /* * create a tracking group for the subvol itself @@ -1470,6 +1704,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, unlock: spin_unlock(&fs_info->qgroup_lock); out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -1486,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - struct ulist *ulist = NULL; struct ulist_node *unode; struct ulist_iterator uiter; @@ -1509,44 +1743,46 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * in a first step, we check all affected qgroups if any limits would * be exceeded */ - ulist = ulist_alloc(GFP_ATOMIC); - if (!ulist) { - ret = -ENOMEM; + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) goto out; - } - ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && - qg->reserved + qg->rfer + num_bytes > + qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer) { ret = -EDQUOT; goto out; } if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && - qg->reserved + qg->excl + num_bytes > + qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl) { ret = -EDQUOT; goto out; } list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; } } - + ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; @@ -1556,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(ulist); - return ret; } @@ -1566,10 +1800,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; - struct ulist *ulist = NULL; struct ulist_node *unode; struct ulist_iterator uiter; u64 ref_root = root->root_key.objectid; + int ret = 0; if (!is_fstree(ref_root)) return; @@ -1587,14 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) if (!qgroup) goto out; - ulist = ulist_alloc(GFP_ATOMIC); - if (!ulist) { - btrfs_std_error(fs_info, -ENOMEM); + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) goto out; - } - ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; @@ -1603,22 +1836,370 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; } } out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(ulist); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n", + pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - trans->delayed_ref_elem.seq); + (u32)(trans->delayed_ref_elem.seq >> 32), + (u32)trans->delayed_ref_elem.seq); BUG(); } + +/* + * returns < 0 on error, 0 when more leafs are to be scanned. + * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + */ +static int +qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_trans_handle *trans, struct ulist *tmp, + struct extent_buffer *scratch_leaf) +{ + struct btrfs_key found; + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct seq_list tree_mod_seq_elem = {}; + u64 seq; + int slot; + int ret; + + path->leave_spinning = 1; + mutex_lock(&fs_info->qgroup_rescan_lock); + ret = btrfs_search_slot_for_read(fs_info->extent_root, + &fs_info->qgroup_rescan_progress, + path, 1, 0); + + pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n", + (unsigned long long)fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.type, + (unsigned long long)fs_info->qgroup_rescan_progress.offset, + ret); + + if (ret) { + /* + * The rescan is about to end, we will not be scanning any + * further blocks. We cannot unset the RESCAN flag here, because + * we want to commit the transaction if everything went well. + * To make the live accounting work in this phase, we set our + * scan progress pointer such that every real extent objectid + * will be smaller. + */ + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + return ret; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found, + btrfs_header_nritems(path->nodes[0]) - 1); + fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; + + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + slot = path->slots[0]; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); + if (found.type != BTRFS_EXTENT_ITEM_KEY) + continue; + ret = btrfs_find_all_roots(trans, fs_info, found.objectid, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + spin_lock(&fs_info->qgroup_lock); + seq = fs_info->qgroup_seq; + fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + + ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + + /* + * step2 of btrfs_qgroup_account_ref works from a single root, + * we're doing all at once here. + */ + ulist_reinit(tmp); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + struct btrfs_qgroup *qg; + + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, + GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + } + + /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; + qg->rfer += found.offset; + qg->rfer_cmpr += found.offset; + WARN_ON(qg->tag >= seq); + if (qg->refcnt - seq == roots->nnodes) { + qg->excl += found.offset; + qg->excl_cmpr += found.offset; + } + qgroup_dirty(fs_info, qg); + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (uintptr_t)glist->group, + GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + } + } + + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + ret = 0; + } + +out: + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + + return ret; +} + +static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, + qgroup_rescan_work); + struct btrfs_path *path; + struct btrfs_trans_handle *trans = NULL; + struct ulist *tmp = NULL; + struct extent_buffer *scratch_leaf = NULL; + int err = -ENOMEM; + + path = btrfs_alloc_path(); + if (!path) + goto out; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + goto out; + scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); + if (!scratch_leaf) + goto out; + + err = 0; + while (!err) { + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + if (!fs_info->quota_enabled) { + err = -EINTR; + } else { + err = qgroup_rescan_leaf(fs_info, path, trans, + tmp, scratch_leaf); + } + if (err > 0) + btrfs_commit_transaction(trans, fs_info->fs_root); + else + btrfs_end_transaction(trans, fs_info->fs_root); + } + +out: + kfree(scratch_leaf); + ulist_free(tmp); + btrfs_free_path(path); + + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + + if (err == 2 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } else if (err < 0) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (err >= 0) { + pr_info("btrfs: qgroup scan completed%s\n", + err == 2 ? " (inconsistency flag cleared)" : ""); + } else { + pr_err("btrfs: qgroup scan failed with %d\n", err); + } + + complete_all(&fs_info->qgroup_rescan_completion); +} + +/* + * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all + * memory required for the rescan context. + */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags) +{ + int ret = 0; + + if (!init_flags && + (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { + ret = -EINVAL; + goto err; + } + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = -EINPROGRESS; + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto err; + } + + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + + memset(&fs_info->qgroup_rescan_progress, 0, + sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + init_completion(&fs_info->qgroup_rescan_completion); + + memset(&fs_info->qgroup_rescan_work, 0, + sizeof(fs_info->qgroup_rescan_work)); + fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + + if (ret) { +err: + pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + return ret; + } + + return 0; +} + +static void +qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + spin_lock(&fs_info->qgroup_lock); + /* clear all current qgroup tracking information */ + for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + qgroup->rfer = 0; + qgroup->rfer_cmpr = 0; + qgroup->excl = 0; + qgroup->excl_cmpr = 0; + } + spin_unlock(&fs_info->qgroup_lock); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct btrfs_trans_handle *trans; + + ret = qgroup_rescan_init(fs_info, 0, 1); + if (ret) + return ret; + + /* + * We have set the rescan_progress to 0, which means no more + * delayed refs will be accounted by btrfs_qgroup_account_ref. + * However, btrfs_qgroup_account_ref may be right after its call + * to btrfs_find_all_roots, in which case it would still do the + * accounting. + * To solve this, we're committing the transaction, which will + * ensure we run all delayed refs and only after that, we are + * going to clear all tracking information for a clean start. + */ + + trans = btrfs_join_transaction(fs_info->fs_root); + if (IS_ERR(trans)) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } + + qgroup_rescan_zero_tracking(fs_info); + + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + + return 0; +} + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (running) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + + return ret; +} + +/* + * this is only called from open_ctree where we're still single threaded, thus + * locking is omitted here. + */ +void +btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) +{ + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); +} diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9a79fb790adb..0525e1389f5b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -410,7 +410,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) /* * remove everything in the cache */ -void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; unsigned long flags; @@ -1010,12 +1010,12 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) * this will try to merge into existing bios if possible, and returns * zero if all went well. */ -int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) +static int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) { struct bio *last = bio_list->tail; u64 last_end = 0; @@ -1050,7 +1050,7 @@ int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 96b93daa0bbb..1031b69252c5 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -955,10 +955,11 @@ int btrfs_reada_wait(void *handle) while (atomic_read(&rc->elems)) { wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 5 * HZ); - dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + dump_devs(rc->root->fs_info, + atomic_read(&rc->elems) < 10 ? 1 : 0); } - dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); kref_put(&rc->refcnt, reada_control_release); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b67171e6d688..12096496cc99 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -326,8 +326,7 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) return NULL; } -void backref_tree_panic(struct rb_node *rb_node, int errno, - u64 bytenr) +static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) { struct btrfs_fs_info *fs_info = NULL; @@ -619,10 +618,13 @@ static noinline_for_stack int find_inline_backref(struct extent_buffer *leaf, int slot, unsigned long *ptr, unsigned long *end) { + struct btrfs_key key; struct btrfs_extent_item *ei; struct btrfs_tree_block_info *bi; u32 item_size; + btrfs_item_key_to_cpu(leaf, &key, slot); + item_size = btrfs_item_size_nr(leaf, slot); #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (item_size < sizeof(*ei)) { @@ -634,13 +636,18 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, WARN_ON(!(btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - if (item_size <= sizeof(*ei) + sizeof(*bi)) { + if (key.type == BTRFS_EXTENT_ITEM_KEY && + item_size <= sizeof(*ei) + sizeof(*bi)) { WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); return 1; } - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + } else { + *ptr = (unsigned long)(ei + 1); + } *end = (unsigned long)ei + item_size; return 0; } @@ -708,7 +715,7 @@ again: end = 0; ptr = 0; key.objectid = cur->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; + key.type = BTRFS_METADATA_ITEM_KEY; key.offset = (u64)-1; path1->search_commit_root = 1; @@ -766,7 +773,8 @@ again: break; } - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { ret = find_inline_backref(eb, path1->slots[0], &ptr, &end); if (ret) @@ -1297,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; + u64 last_snap = 0; int ret; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); @@ -1312,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BTRFS_TREE_RELOC_OBJECTID); BUG_ON(ret); + last_snap = btrfs_root_last_snapshot(&root->root_item); btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); } else { @@ -1337,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); root_item->drop_level = 0; + /* + * abuse rtransid, it is safe because it is impossible to + * receive data into a relocation tree. + */ + btrfs_set_root_rtransid(root_item, last_snap); + btrfs_set_root_otransid(root_item, trans->transid); } btrfs_tree_unlock(eb); @@ -1347,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); + reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; return reloc_root; @@ -1762,7 +1777,11 @@ again: eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = (!eb) ? -ENOMEM : -EIO; + free_extent_buffer(eb); + break; + } btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, @@ -1915,6 +1934,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); blocksize = btrfs_level_size(root, i - 1); eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } BUG_ON(btrfs_header_level(eb) != i - 1); path->nodes[i - 1] = eb; path->slots[i - 1] = 0; @@ -2257,8 +2280,12 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { + struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; + u64 last_snap; + u64 otransid; + u64 objectid; LIST_HEAD(reloc_roots); int found = 0; int ret = 0; @@ -2292,12 +2319,44 @@ again: } else { list_del_init(&reloc_root->root_list); } + + /* + * we keep the old last snapshod transid in rtranid when we + * created the relocation tree. + */ + last_snap = btrfs_root_rtransid(&reloc_root->root_item); + otransid = btrfs_root_otransid(&reloc_root->root_item); + objectid = reloc_root->root_key.offset; + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); if (ret < 0) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; + } else if (!ret) { + /* + * recover the last snapshot tranid to avoid + * the space balance break NOCOW. + */ + root = read_fs_root(rc->extent_root->fs_info, + objectid); + if (IS_ERR(root)) + continue; + + if (btrfs_root_refs(&root->root_item) == 0) + continue; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + /* Check if the fs/file tree was snapshoted or not. */ + if (btrfs_root_last_snapshot(&root->root_item) == + otransid - 1) + btrfs_set_root_last_snapshot(&root->root_item, + last_snap); + + btrfs_end_transaction(trans, root); } } @@ -2592,7 +2651,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, node->level); generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, blocksize, generation); - if (!eb) { + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); err = -EIO; goto next; } @@ -2753,7 +2813,10 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.objectid, block->key.offset); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); @@ -2768,8 +2831,13 @@ static int reada_tree_block(struct reloc_control *rc, struct tree_block *block) { BUG_ON(block->key_ready); - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); + if (block->key.type == BTRFS_METADATA_ITEM_KEY) + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, + rc->extent_root->leafsize); + else + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); return 0; } @@ -2850,7 +2918,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_path; + goto out_free_blocks; } rb_node = rb_first(blocks); @@ -2864,8 +2932,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - get_tree_block_key(rc, block); + if (!block->key_ready) { + err = get_tree_block_key(rc, block); + if (err) + goto out_free_path; + } rb_node = rb_next(rb_node); } @@ -2892,8 +2963,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, out: err = finish_pending_nodes(trans, rc, path, err); +out_free_path: btrfs_free_path(path); -out_path: +out_free_blocks: free_block_list(blocks); return err; } @@ -2965,7 +3037,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, lock_extent(&BTRFS_I(inode)->io_tree, start, end); while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -3176,12 +3248,17 @@ static int add_tree_block(struct reloc_control *rc, eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); - if (item_size >= sizeof(*ei) + sizeof(*bi)) { + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - bi = (struct btrfs_tree_block_info *)(ei + 1); + if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + level = btrfs_tree_block_level(eb, bi); + } else { + level = (int)extent_key->offset; + } generation = btrfs_extent_generation(eb, ei); - level = btrfs_tree_block_level(eb, bi); } else { #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 u64 ref_owner; @@ -3210,7 +3287,7 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = extent_key->offset; + block->key.objectid = rc->extent_root->leafsize; block->key.offset = generation; block->level = level; block->key_ready = 0; @@ -3232,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc, struct btrfs_path *path; struct btrfs_key key; int ret; + bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, + SKINNY_METADATA); if (tree_block_processed(bytenr, blocksize, rc)) return 0; @@ -3242,19 +3321,42 @@ static int __add_tree_block(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; + if (skinny) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + } path->search_commit_root = 1; path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) goto out; + + if (ret > 0 && skinny) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + (key.type == BTRFS_METADATA_ITEM_KEY || + (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == blocksize))) + ret = 0; + } + + if (ret) { + skinny = false; + btrfs_release_path(path); + goto again; + } + } BUG_ON(ret); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = add_tree_block(rc, &key, path, blocks); out: btrfs_free_path(path); @@ -3275,7 +3377,8 @@ static int block_use_full_backref(struct reloc_control *rc, return 1; ret = btrfs_lookup_extent_info(NULL, rc->extent_root, - eb->start, eb->len, NULL, &flags); + eb->start, btrfs_header_level(eb), 1, + NULL, &flags); BUG_ON(ret); if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) @@ -3309,6 +3412,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, } truncate: + ret = btrfs_check_trunc_cache_free_space(root, + &fs_info->global_block_rsv); + if (ret) + goto out; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3644,12 +3752,25 @@ next: break; } - if (key.type != BTRFS_EXTENT_ITEM_KEY || + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY && key.objectid + key.offset <= rc->search_start) { path->slots[0]++; goto next; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.objectid + rc->extent_root->leafsize <= + rc->search_start) { + path->slots[0]++; + goto next; + } + ret = find_first_extent_bit(&rc->processed_blocks, key.objectid, &start, &end, EXTENT_DIRTY, NULL); @@ -3658,7 +3779,11 @@ next: btrfs_release_path(path); rc->search_start = end + 1; } else { - rc->search_start = key.objectid + key.offset; + if (key.type == BTRFS_EXTENT_ITEM_KEY) + rc->search_start = key.objectid + key.offset; + else + rc->search_start = key.objectid + + rc->extent_root->leafsize; memcpy(extent_key, &key, sizeof(key)); return 0; } @@ -4019,7 +4144,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4030,7 +4155,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); return rc; } @@ -4047,7 +4173,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) return -ENOMEM; @@ -4096,19 +4222,16 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + ret = btrfs_start_all_delalloc_inodes(fs_info, 0); if (ret < 0) { err = ret; goto out; } - btrfs_wait_ordered_extents(fs_info->tree_root, 0); + btrfs_wait_all_ordered_extents(fs_info, 0); while (1) { mutex_lock(&fs_info->cleaner_mutex); - - btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; @@ -4216,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_fs_root_no_radix(root, &key); + reloc_root = btrfs_read_fs_root(root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; @@ -4251,7 +4374,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(root->fs_info); if (!rc) { err = -ENOMEM; goto out; @@ -4335,10 +4458,8 @@ out: int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; int ret; u64 disk_bytenr; LIST_HEAD(list); @@ -4352,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) if (ret) goto out; + disk_bytenr = ordered->start; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + sums->bytenr = disk_bytenr; + disk_bytenr += sums->len; btrfs_add_ordered_sum(inode, ordered, sums); } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 668af537a3ea..ffb1036ef10d 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -29,9 +29,8 @@ * generation numbers as then we know the root was once mounted with an older * kernel that was not aware of the root item structure change. */ -void btrfs_read_root_item(struct btrfs_root *root, - struct extent_buffer *eb, int slot, - struct btrfs_root_item *item) +void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) { uuid_le uuid; int len; @@ -65,52 +64,59 @@ void btrfs_read_root_item(struct btrfs_root *root, } /* - * lookup the root with the highest offset for a given objectid. The key we do - * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 - * on error. + * btrfs_find_root - lookup the root by the key. + * root: the root of the root tree + * search_key: the key to search + * path: the path we search + * root_item: the root item of the tree we look for + * root_key: the reak key of the tree we look for + * + * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * of the search key, just lookup the root with the highest offset for a + * given objectid. + * + * If we find something return 0, otherwise > 0, < 0 on error. */ -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, - struct btrfs_root_item *item, struct btrfs_key *key) +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key) { - struct btrfs_path *path; - struct btrfs_key search_key; struct btrfs_key found_key; struct extent_buffer *l; int ret; int slot; - search_key.objectid = objectid; - search_key.type = BTRFS_ROOT_ITEM_KEY; - search_key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; - BUG_ON(ret == 0); - if (path->slots[0] == 0) { - ret = 1; - goto out; + if (search_key->offset != -1ULL) { /* the search key is exact */ + if (ret > 0) + goto out; + } else { + BUG_ON(ret == 0); /* Logical error */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + ret = 0; } + l = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid || + if (found_key.objectid != search_key->objectid || found_key.type != BTRFS_ROOT_ITEM_KEY) { ret = 1; goto out; } - if (item) - btrfs_read_root_item(root, l, slot, item); - if (key) - memcpy(key, &found_key, sizeof(found_key)); - ret = 0; + if (root_item) + btrfs_read_root_item(l, slot, root_item); + if (root_key) + memcpy(root_key, &found_key, sizeof(found_key)); out: - btrfs_free_path(path); + btrfs_release_path(path); return ret; } @@ -213,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } -/* - * at mount time we want to find all the old transaction snapshots that were in - * the process of being deleted if we crashed. This is any root item with an - * offset lower than the latest root. They need to be queued for deletion to - * finish what was happening when we crashed. - */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_root *dead_root; - struct btrfs_root_item *ri; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - int ret; - u32 nritems; - struct extent_buffer *leaf; - int slot; - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) - goto next; - - if (key.objectid < objectid) - goto next; - - if (key.objectid > objectid) - break; - - ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); - if (btrfs_disk_root_refs(leaf, ri) != 0) - goto next; - - memcpy(&found_key, &key, sizeof(key)); - key.offset++; - btrfs_release_path(path); - dead_root = - btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &found_key); - if (IS_ERR(dead_root)) { - ret = PTR_ERR(dead_root); - goto err; - } - - ret = btrfs_add_dead_root(dead_root); - if (ret) - goto err; - goto again; -next: - slot++; - path->slots[0]++; - } - ret = 0; -err: - btrfs_free_path(path); - return ret; -} - int btrfs_find_orphan_roots(struct btrfs_root *tree_root) { struct extent_buffer *leaf; @@ -302,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct btrfs_root *root; int err = 0; int ret; + bool can_recover = true; + + if (tree_root->fs_info->sb->s_flags & MS_RDONLY) + can_recover = false; path = btrfs_alloc_path(); if (!path) @@ -341,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; - root = btrfs_read_fs_root_no_name(tree_root->fs_info, - &root_key); - if (!IS_ERR(root)) + root = btrfs_read_fs_root(tree_root, &root_key); + err = PTR_RET(root); + if (err && err != -ENOENT) { + break; + } else if (err == -ENOENT) { + struct btrfs_trans_handle *trans; + + btrfs_release_path(path); + + trans = btrfs_join_transaction(tree_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_error(tree_root->fs_info, err, + "Failed to start trans to delete " + "orphan item"); + break; + } + err = btrfs_del_orphan_item(trans, tree_root, + root_key.objectid); + btrfs_end_transaction(trans, tree_root); + if (err) { + btrfs_error(tree_root->fs_info, err, + "Failed to delete root orphan " + "item"); + break; + } continue; + } - ret = PTR_ERR(root); - if (ret != -ENOENT) { - err = ret; + if (btrfs_root_refs(&root->root_item) == 0) { + btrfs_add_dead_root(root); + continue; + } + + err = btrfs_init_fs_root(root); + if (err) { + btrfs_free_fs_root(root); break; } - ret = btrfs_find_dead_roots(tree_root, root_key.objectid); - if (ret) { - err = ret; + root->orphan_item_inserted = 1; + + err = btrfs_insert_fs_root(root->fs_info, root); + if (err) { + BUG_ON(err == -EEXIST); + btrfs_free_fs_root(root); break; } } @@ -369,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_path *path; int ret; - struct btrfs_root_item *ri; - struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) @@ -380,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, goto out; BUG_ON(ret != 0); - leaf = path->nodes[0]; - ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); ret = btrfs_del_item(trans, root, path); out: diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 85e072b956d5..4ba2a69a60ad 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { page->io_error = 1; sblock->no_io_error_seen = 0; @@ -1336,7 +1336,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int page_num; u8 calculated_csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; - struct btrfs_root *root = fs_info->extent_root; void *mapped_buffer; WARN_ON(!sblock->pagev[0]->page); @@ -1365,12 +1364,11 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, for (page_num = 0;;) { if (page_num == 0 && is_metadata) - crc = btrfs_csum_data(root, + crc = btrfs_csum_data( ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, crc, PAGE_SIZE - BTRFS_CSUM_SIZE); else - crc = btrfs_csum_data(root, mapped_buffer, crc, - PAGE_SIZE); + crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE); kunmap_atomic(mapped_buffer); page_num++; @@ -1433,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; @@ -1524,7 +1522,7 @@ again: sbio->dev = wr_ctx->tgtdev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); if (!bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -1657,7 +1655,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sctx->dev_root; u64 len; int index; @@ -1674,7 +1671,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(root, buffer, crc, l); + crc = btrfs_csum_data(buffer, crc, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1744,7 +1741,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(root, p, crc, l); + crc = btrfs_csum_data(p, crc, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1805,7 +1802,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(root, p, crc, l); + crc = btrfs_csum_data(p, crc, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1933,7 +1930,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -2129,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; - int ret = 0; - unsigned long i; + unsigned long index; unsigned long num_sectors; while (!list_empty(&sctx->csum_list)) { @@ -2149,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, if (!sum) return 0; + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; num_sectors = sum->len / sctx->sectorsize; - for (i = 0; i < num_sectors; ++i) { - if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sctx->csum_size); - ret = 1; - break; - } - } - if (ret && i == num_sectors - 1) { + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); } - return ret; + return 1; } /* scrub extent tries to collect up to 64 kB for each bio */ @@ -2236,12 +2227,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 flags; int ret; int slot; - int i; u64 nstripes; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; + u64 logic_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2255,6 +2246,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; + int stop_loop; if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { @@ -2315,8 +2307,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; key_end.objectid = base + offset + nstripes * increment; - key_end.type = BTRFS_EXTENT_ITEM_KEY; - key_end.offset = (u64)0; + key_end.type = BTRFS_METADATA_ITEM_KEY; + key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -2354,8 +2346,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ logical = base + offset; physical = map->stripes[num].physical; + logic_end = logical + increment * nstripes; ret = 0; - for (i = 0; i < nstripes; ++i) { + while (logical < logic_end) { /* * canceled? */ @@ -2391,19 +2384,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wake_up(&fs_info->scrub_pause_wait); } - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; + if (ret > 0) { ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); @@ -2420,7 +2408,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } } + stop_loop = 0; while (1) { + u64 bytes; + l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -2430,19 +2421,30 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (ret < 0) goto out; + stop_loop = 1; break; } btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid + key.offset <= logical) - goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->leafsize; + else + bytes = key.offset; - if (key.objectid >= logical + map->stripe_len) - break; + if (key.objectid + bytes <= logical) + goto next; - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) goto next; + if (key.objectid >= logical + map->stripe_len) { + /* out of this device extent */ + if (key.objectid >= logic_end) + stop_loop = 1; + break; + } + extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(l, extent); @@ -2458,22 +2460,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto next; } +again: + extent_logical = key.objectid; + extent_len = bytes; + /* * trim extent to this stripe */ - if (key.objectid < logical) { - key.offset -= logical - key.objectid; - key.objectid = logical; + if (extent_logical < logical) { + extent_len -= logical - extent_logical; + extent_logical = logical; } - if (key.objectid + key.offset > + if (extent_logical + extent_len > logical + map->stripe_len) { - key.offset = logical + map->stripe_len - - key.objectid; + extent_len = logical + map->stripe_len - + extent_logical; } - extent_logical = key.objectid; - extent_physical = key.objectid - logical + physical; - extent_len = key.offset; + extent_physical = extent_logical - logical + physical; extent_dev = scrub_dev; extent_mirror_num = mirror_num; if (is_dev_replace) @@ -2481,13 +2485,36 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, extent_len, &extent_physical, &extent_dev, &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + ret = scrub_extent(sctx, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, - key.objectid - logical + physical); + extent_physical); if (ret) goto out; + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + logical += increment; + physical += map->stripe_len; + + if (logical < key.objectid + bytes) { + cond_resched(); + goto again; + } + + if (logical >= logic_end) { + stop_loop = 1; + break; + } + } next: path->slots[0]++; } @@ -2495,8 +2522,14 @@ next: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); - sctx->stat.last_physical = physical; + if (stop_loop) + sctx->stat.last_physical = map->stripes[num].physical + + length; + else + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); + if (stop_loop) + break; } out: /* push queued extents */ @@ -3005,28 +3038,6 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_device *dev; - int ret; - - /* - * we have to hold the device_list_mutex here so the device - * does not go away in cancel_dev. FIXME: find a better solution - */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; - } - ret = btrfs_scrub_cancel_dev(fs_info, dev); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - return ret; -} - int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress) { @@ -3188,16 +3199,18 @@ out: static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) { - unsigned long index; struct scrub_copy_nocow_ctx *nocow_ctx = ctx; - int ret = 0; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct page *page; struct btrfs_root *local_root; u64 physical_for_dev_replace; u64 len; - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + unsigned long index; int srcu_index; + int ret; + int err; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; @@ -3211,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) return PTR_ERR(local_root); } + if (btrfs_root_refs(&local_root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return -ENOENT; + } + key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; @@ -3219,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + + ret = 0; physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; len = nocow_ctx->len; while (len >= PAGE_CACHE_SIZE) { - struct page *page = NULL; - int ret_sub; - index = offset >> PAGE_CACHE_SHIFT; - +again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { pr_err("find_or_create_page() failed\n"); ret = -ENOMEM; - goto next_page; + goto out; } if (PageUptodate(page)) { @@ -3239,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) goto next_page; } else { ClearPageError(page); - ret_sub = extent_read_full_page(&BTRFS_I(inode)-> + err = extent_read_full_page(&BTRFS_I(inode)-> io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); - if (ret_sub) { - ret = ret_sub; + if (err) { + ret = err; goto next_page; } - wait_on_page_locked(page); + + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + page_cache_release(page); + goto again; + } if (!PageUptodate(page)) { ret = -EIO; goto next_page; } } - ret_sub = write_page_nocow(nocow_ctx->sctx, - physical_for_dev_replace, page); - if (ret_sub) { - ret = ret_sub; - goto next_page; - } - + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; next_page: - if (page) { - unlock_page(page); - put_page(page); - } + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } - - if (inode) - iput(inode); +out: + mutex_unlock(&inode->i_mutex); + iput(inode); return ret; } @@ -3291,7 +3321,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c85e7c6b4598..d3f3b43cae0b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p) } } -static struct fs_path *fs_path_alloc(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc(void) { struct fs_path *p; @@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx) return p; } -static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc_reversed(void) { struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return NULL; p->reversed = 1; @@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) return p; } -static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) +static void fs_path_free(struct fs_path *p) { if (!p) return; @@ -387,7 +387,7 @@ static struct btrfs_path *alloc_path_for_send(void) return path; } -int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) +static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) { int ret; mm_segment_t old_fs; @@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, * * path must point to the INODE_REF or INODE_EXTREF when called. */ -static int iterate_inode_ref(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve, iterate_inode_ref_t iterate, void *ctx) { @@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx, unsigned long elem_size; unsigned long ptr; - p = fs_path_alloc_reversed(sctx); + p = fs_path_alloc_reversed(); if (!p) return -ENOMEM; tmp_path = alloc_path_for_send(); if (!tmp_path) { - fs_path_free(sctx, p); + fs_path_free(p); return -ENOMEM; } @@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx, out: btrfs_free_path(tmp_path); - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * * path must point to the dir item when called. */ -static int iterate_dir_item(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { @@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index, * Retrieve the first path of an inode. If an inode has more then one * ref/hardlink, this is ignored. */ -static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, +static int get_inode_path(struct btrfs_root *root, u64 ino, struct fs_path *path) { int ret; @@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, goto out; } - ret = iterate_inode_ref(sctx, root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); if (ret < 0) goto out; ret = 0; @@ -1314,8 +1312,7 @@ out: return ret; } -static int read_symlink(struct send_ctx *sctx, - struct btrfs_root *root, +static int read_symlink(struct btrfs_root *root, u64 ino, struct fs_path *dest) { @@ -1562,8 +1559,7 @@ out: * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, * generation of the parent dir and the name of the dir entry. */ -static int get_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, u64 ino, +static int get_first_ref(struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) { int ret; @@ -1628,8 +1624,7 @@ out: return ret; } -static int is_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, +static int is_first_ref(struct btrfs_root *root, u64 ino, u64 dir, const char *name, int name_len) { @@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx, u64 tmp_dir; u64 tmp_dir_gen; - tmp_name = fs_path_alloc(sctx); + tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); if (ret < 0) goto out; @@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx, ret = !memcmp(tmp_name->start, name, name_len); out: - fs_path_free(sctx, tmp_name); + fs_path_free(tmp_name); return ret; } @@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) if (!sctx->parent_root) goto out; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) return -ENOMEM; - ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); if (ret < 0) goto out; @@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) name->start, fs_path_len(name)); out: - fs_path_free(sctx, name); + fs_path_free(name); return ret; } @@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * send_root or parent_root for ref lookup. */ if (ino < sctx->send_progress) - ret = get_first_ref(sctx, sctx->send_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); else - ret = get_first_ref(sctx, sctx->parent_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); if (ret < 0) goto out; @@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_gen = 0; int stop = 0; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) { ret = -ENOMEM; goto out; @@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, } out: - fs_path_free(sctx, name); + fs_path_free(name); if (!ret) fs_path_unreverse(dest); return ret; @@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) verbose_printk("btrfs: send_utimes %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); btrfs_free_path(path); return ret; } @@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) verbose_printk("btrfs: send_create_inode %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (S_ISLNK(mode)) { fs_path_reset(p); - ret = read_symlink(sctx, sctx->send_root, ino, p); + ret = read_symlink(sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); @@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir, return 0; } -static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) +static void __free_recorded_refs(struct list_head *head) { struct recorded_ref *cur; while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(sctx, cur->full_path); + fs_path_free(cur->full_path); list_del(&cur->list); kfree(cur); } @@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) static void free_recorded_refs(struct send_ctx *sctx) { - __free_recorded_refs(sctx, &sctx->new_refs); - __free_recorded_refs(sctx, &sctx->deleted_refs); + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); } /* @@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, int ret; struct fs_path *orphan; - orphan = fs_path_alloc(sctx); + orphan = fs_path_alloc(); if (!orphan) return -ENOMEM; @@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, ret = send_rename(sctx, path, orphan); out: - fs_path_free(sctx, orphan); + fs_path_free(orphan); return ret; } @@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); - valid_path = fs_path_alloc(sctx); + valid_path = fs_path_alloc(); if (!valid_path) { ret = -ENOMEM; goto out; @@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = is_first_ref(sctx, sctx->parent_root, - ow_inode, cur->dir, cur->name, - cur->name_len); + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); if (ret < 0) goto out; if (ret) { @@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); out: free_recorded_refs(sctx); ulist_free(check_dirs); - fs_path_free(sctx, valid_path); + fs_path_free(valid_path); return ret; } @@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index, struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index, out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index, return 0; } -static int find_iref(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_iref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, u64 dir, struct fs_path *name) @@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx, ctx.name = name; ctx.found_idx = -1; - ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); if (ret < 0) return ret; @@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index, int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->parent_root, sctx->right_path, + ret = find_iref(sctx->parent_root, sctx->right_path, sctx->cmp_key, dir, name); if (ret == -ENOENT) ret = __record_new_ref(num, dir, index, name, sctx); @@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index, int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, dir, name); if (ret == -ENOENT) ret = __record_deleted_ref(num, dir, index, name, sctx); @@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx) { int ret = 0; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, __record_changed_new_ref, sctx); if (ret < 0) goto out; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); if (ret < 0) goto out; @@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, - sctx); + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); btrfs_release_path(path); if (ret < 0) goto out; @@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, struct fs_path *p; posix_acl_xattr_header dummy_acl; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, ret = send_set_xattr(sctx, p, name, name_len, data, data_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key, struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key, ret = send_remove_xattr(sctx, p, name, name_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx) { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); return ret; } @@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx) { int ret; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); return ret; } @@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmalloc(data_len, GFP_NOFS); + ctx->found_data = kmemdup(data, data_len, GFP_NOFS); if (!ctx->found_data) return -ENOMEM; - memcpy(ctx->found_data, data, data_len); return 1; } return 0; } -static int find_xattr(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_xattr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, @@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); if (ret < 0) return ret; @@ -3479,11 +3470,10 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, struct send_ctx *sctx = ctx; char *found_data = NULL; int found_data_len = 0; - struct fs_path *p = NULL; - ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, name, name_len, &found_data, - &found_data_len); + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3498,7 +3488,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, } kfree(found_data); - fs_path_free(sctx, p); return ret; } @@ -3510,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, int ret; struct send_ctx *sctx = ctx; - ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, - name, name_len, NULL, NULL); + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3525,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx) { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, + ret = iterate_dir_item(sctx->send_root, sctx->left_path, sctx->cmp_key, __process_changed_new_xattr, sctx); if (ret < 0) goto out; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, sctx->cmp_key, __process_changed_deleted_xattr, sctx); out: @@ -3574,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(sctx, root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, &found_key, + __process_new_xattr, sctx); if (ret < 0) goto out; @@ -3600,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int num_read = 0; mm_segment_t old_fs; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3642,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); set_fs(old_fs); if (ret < 0) return ret; @@ -3665,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " clone_root->root->objectid, clone_root->ino, clone_root->offset); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3688,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { - ret = get_inode_path(sctx, clone_root->root, - clone_root->ino, p); + ret = get_inode_path(clone_root->root, clone_root->ino, p); } if (ret < 0) goto out; @@ -3706,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3719,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3739,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx, tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -4529,9 +4517,11 @@ static int send_subvol(struct send_ctx *sctx) { int ret; - ret = send_header(sctx); - if (ret < 0) - goto out; + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { + ret = send_header(sctx); + if (ret < 0) + goto out; + } ret = send_subvol_begin(sctx); if (ret < 0) @@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) send_root = BTRFS_I(file_inode(mnt_file))->root; fs_info = send_root->fs_info; + /* + * This is done when we lookup the root, it should already be complete + * by the time we get here. + */ + WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); + + /* + * If we just created this root we need to make sure that the orphan + * cleanup has been done and committed since we search the commit root, + * so check its commit root transid with our otransid and if they match + * commit the transaction to make sure everything is updated. + */ + down_read(&send_root->fs_info->extent_commit_sem); + if (btrfs_header_generation(send_root->commit_root) == + btrfs_root_otransid(&send_root->root_item)) { + struct btrfs_trans_handle *trans; + + up_read(&send_root->fs_info->extent_commit_sem); + + trans = btrfs_attach_transaction_barrier(send_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto out; + } + /* ENOENT means theres no transaction */ + } else { + ret = btrfs_commit_transaction(trans, send_root); + if (ret) + goto out; + } + } else { + up_read(&send_root->fs_info->extent_commit_sem); + } + arg = memdup_user(arg_, sizeof(*arg)); if (IS_ERR(arg)) { ret = PTR_ERR(arg); @@ -4593,7 +4618,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { + if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; } @@ -4612,8 +4637,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->flags = arg->flags; sctx->send_filp = fget(arg->send_fd); - if (IS_ERR(sctx->send_filp)) { - ret = PTR_ERR(sctx->send_filp); + if (!sctx->send_filp) { + ret = -EBADF; goto out; } @@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; clone_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!clone_root) { - ret = -EINVAL; - goto out; - } if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; @@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!sctx->parent_root) { - ret = -EINVAL; + if (IS_ERR(sctx->parent_root)) { + ret = PTR_ERR(sctx->parent_root); goto out; } } @@ -4704,12 +4725,14 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) if (ret < 0) goto out; - ret = begin_cmd(sctx, BTRFS_SEND_C_END); - if (ret < 0) - goto out; - ret = send_cmd(sctx); - if (ret < 0) - goto out; + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) + goto out; + ret = send_cmd(sctx); + if (ret < 0) + goto out; + } out: kfree(arg); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 8bb18f7ccaa6..48d425aef05b 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -131,5 +131,4 @@ enum { #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); -int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f6b88595f858..8eb6191d86da 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,11 +51,11 @@ #include "print-tree.h" #include "xattr.h" #include "volumes.h" -#include "version.h" #include "export.h" #include "compression.h" #include "rcu-string.h" #include "dev-replace.h" +#include "free-space-cache.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -63,9 +63,9 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(int errno, char nbuf[16]) +static const char *btrfs_decode_error(int errno) { - char *errstr = NULL; + char *errstr = "unknown"; switch (errno) { case -EIO: @@ -80,18 +80,18 @@ static const char *btrfs_decode_error(int errno, char nbuf[16]) case -EEXIST: errstr = "Object already exists"; break; - default: - if (nbuf) { - if (snprintf(nbuf, 16, "error %d", -errno) >= 0) - errstr = nbuf; - } + case -ENOSPC: + errstr = "No space left"; + break; + case -ENOENT: + errstr = "No such entry"; break; } return errstr; } -static void __save_error_info(struct btrfs_fs_info *fs_info) +static void save_error_info(struct btrfs_fs_info *fs_info) { /* * today we only save the error info into ram. Long term we'll @@ -100,11 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - __save_error_info(fs_info); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -115,7 +110,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { sb->s_flags |= MS_RDONLY; - printk(KERN_INFO "btrfs is forced readonly\n"); + btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not * canceled here although there is no way to update @@ -126,7 +121,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * mounted writeable again, the device replace * operation continues. */ -// WARN_ON(1); } } @@ -139,7 +133,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char nbuf[16]; const char *errstr; /* @@ -149,7 +142,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; - errstr = btrfs_decode_error(errno, nbuf); + errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; va_list args; @@ -158,19 +151,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", - sb->s_id, function, line, errstr, &vaf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, function, line, errno, errstr, &vaf); va_end(args); } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", + sb->s_id, function, line, errno, errstr); } /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); + save_error_info(fs_info); + if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); - } } static const char * const logtypes[] = { @@ -184,7 +176,7 @@ static const char * const logtypes[] = { "debug", }; -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; char lvl[4]; @@ -208,7 +200,7 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -252,21 +244,30 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n"); + /* + * Report first abort since mount + */ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, + &root->fs_info->fs_state)) { + WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", + errno); + } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ if (!trans->blocks_used) { - char nbuf[16]; const char *errstr; - errstr = btrfs_decode_error(errno, nbuf); - btrfs_printk(root->fs_info, - "%s:%d: Aborting unused transaction(%s).\n", - function, line, errstr); + errstr = btrfs_decode_error(errno); + btrfs_warn(root->fs_info, + "%s:%d: Aborting unused transaction(%s).", + function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_blocked_wait); __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* @@ -276,7 +277,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { - char nbuf[16]; char *s_id = "<unknown>"; const char *errstr; struct va_format vaf = { .fmt = fmt }; @@ -288,13 +288,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(errno, nbuf); + errstr = btrfs_decode_error(errno); if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", - s_id, function, line, &vaf, errstr); + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", - s_id, function, line, &vaf, errstr); + printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); va_end(args); /* Caller calls BUG() */ } @@ -650,7 +650,7 @@ out: */ static int btrfs_parse_early_options(const char *options, fmode_t flags, void *holder, char **subvol_name, u64 *subvol_objectid, - u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) + struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; @@ -693,16 +693,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_subvolrootid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { - /* we want the original fs_tree */ - if (!intarg) - *subvol_rootid = - BTRFS_FS_TREE_OBJECTID; - else - *subvol_rootid = intarg; - } + printk(KERN_WARNING + "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); break; case Opt_device: device_name = match_strdup(&args[0]); @@ -786,9 +778,6 @@ find_root: if (IS_ERR(new_root)) return ERR_CAST(new_root); - if (btrfs_root_refs(&new_root->root_item) == 0) - return ERR_PTR(-ENOENT); - dir_id = btrfs_root_dirid(&new_root->root_item); setup_root: location.objectid = dir_id; @@ -876,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(fs_info, 1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1080,7 +1069,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; - u64 subvol_rootid = 0; int error = 0; if (!(flags & MS_RDONLY)) @@ -1088,7 +1076,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, - &subvol_rootid, &fs_devices); + &fs_devices); if (error) { kfree(subvol_name); return ERR_PTR(error); @@ -1202,11 +1190,14 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, - unsigned long old_opts, int flags) +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || (flags & MS_RDONLY))) { @@ -1247,7 +1238,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; - btrfs_remount_prepare(fs_info, old_opts, *flags); + btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); if (ret) { @@ -1255,6 +1246,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -1270,6 +1262,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) @@ -1691,6 +1684,18 @@ static void btrfs_interface_exit(void) printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); } +static void btrfs_print_info(void) +{ + printk(KERN_INFO "Btrfs loaded" +#ifdef CONFIG_BTRFS_DEBUG + ", debug=on" +#endif +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + ", integrity-checker=on" +#endif + "\n"); +} + static int __init init_btrfs_fs(void) { int err; @@ -1739,7 +1744,9 @@ static int __init init_btrfs_fs(void) btrfs_init_lockdep(); - printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + btrfs_print_info(); + btrfs_test_free_space_cache(); + return 0; unregister_ioctl: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 50767bbaad6c..d58cce77fc6c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -34,12 +34,43 @@ #define BTRFS_ROOT_TRANS_TAG 0 -void put_transaction(struct btrfs_transaction *transaction) +static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { + [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | + __TRANS_START), + [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN), + [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), + [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), +}; + +static void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); + while (!list_empty(&transaction->pending_chunks)) { + struct extent_map *em; + + em = list_first_entry(&transaction->pending_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } -static inline int can_join_transaction(struct btrfs_transaction *trans, - int type) +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, + unsigned int type) { - return !(trans->in_commit && - type != TRANS_JOIN && - type != TRANS_JOIN_NOLOCK); + if (type & TRANS_EXTWRITERS) + atomic_inc(&trans->num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_dec(&trans->num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(&trans->num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int type) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -74,32 +122,19 @@ loop: return -EROFS; } - if (fs_info->trans_no_join) { - /* - * If we are JOIN_NOLOCK we're already committing a current - * transaction, we just need a handle to deal with something - * when committing the transaction, such as inode cache and - * space cache. It is a special case. - */ - if (type != TRANS_JOIN_NOLOCK) { - spin_unlock(&fs_info->trans_lock); - return -EBUSY; - } - } - cur_trans = fs_info->running_transaction; if (cur_trans) { if (cur_trans->aborted) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } - if (!can_join_transaction(cur_trans, type)) { + if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; + extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); return 0; } @@ -112,6 +147,12 @@ loop: if (type == TRANS_ATTACH) return -ENOENT; + /* + * JOIN_NOLOCK only happens during the transaction commit, so + * it is impossible that ->running_transaction is NULL + */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -120,7 +161,7 @@ loop: if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure - * to redo the trans_no_join checks above + * to redo the checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); goto loop; @@ -131,17 +172,15 @@ loop: } atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; + extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; + cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.root = RB_ROOT; @@ -162,9 +201,8 @@ loop: if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); - atomic_set(&fs_info->tree_mod_seq, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); - spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); atomic_set(&cur_trans->delayed_refs.ref_seq, 0); @@ -172,6 +210,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); + INIT_LIST_HEAD(&cur_trans->pending_chunks); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; } +static inline int is_transaction_blocked(struct btrfs_transaction *trans) +{ + return (trans->state >= TRANS_STATE_BLOCKED && + trans->state < TRANS_STATE_UNBLOCKED && + !trans->aborted); +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; - if (cur_trans && cur_trans->blocked) { + if (cur_trans && is_transaction_blocked(cur_trans)) { atomic_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); wait_event(root->fs_info->transaction_wait, - !cur_trans->blocked); + cur_trans->state >= TRANS_STATE_UNBLOCKED || + cur_trans->aborted); put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); @@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, int type, +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; @@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, return ERR_PTR(-EROFS); if (current->journal_info) { - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; WARN_ON(h->use_count > 2); @@ -366,7 +413,7 @@ again: * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). */ - if (type < TRANS_JOIN_NOLOCK) + if (type & __TRANS_FREEZABLE) sb_start_intwrite(root->fs_info->sb); if (may_wait_transaction(root, type)) @@ -408,7 +455,8 @@ again: INIT_LIST_HEAD(&h->new_bgs); smp_mb(); - if (cur_trans->blocked && may_wait_transaction(root, type)) { + if (cur_trans->state >= TRANS_STATE_BLOCKED && + may_wait_transaction(root, type)) { btrfs_commit_transaction(h, root); goto again; } @@ -429,7 +477,7 @@ got_it: return h; join_fail: - if (type < TRANS_JOIN_NOLOCK) + if (type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: @@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction() - catch the running transaction + * btrfs_attach_transaction_barrier() - catch the running transaction * * It is similar to the above function, the differentia is this one * will wait for all the inactive transactions until they fully @@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { - wait_event(commit->commit_wait, commit->commit_done); + wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); } int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) @@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) spin_lock(&root->fs_info->trans_lock); list_for_each_entry_reverse(t, &root->fs_info->trans_list, list) { - if (t->in_commit) { - if (t->commit_done) + if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; atomic_inc(&cur_trans->use_count); @@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root) static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; + if (root->fs_info->global_block_rsv.space_info->full && + btrfs_should_throttle_delayed_refs(trans, root)) + return 1; - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); - return ret ? 1 : 0; + return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); } int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, @@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, int err; smp_mb(); - if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + if (cur_trans->state >= TRANS_STATE_BLOCKED || + cur_trans->delayed_refs.flushing) return 1; updates = trans->delayed_ref_updates; @@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; - int count = 0; + unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 1) { - unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (btrfs_should_throttle_delayed_refs(trans, root)) { + cur = max_t(unsigned long, cur, 1); trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; + btrfs_run_delayed_refs(trans, root, cur); } btrfs_trans_release_metadata(trans, root); @@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && - should_end_transaction(trans, root)) { - trans->transaction->blocked = 1; - smp_wmb(); + should_end_transaction(trans, root) && + ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { + spin_lock(&info->trans_lock); + if (cur_trans->state == TRANS_STATE_RUNNING) + cur_trans->state = TRANS_STATE_BLOCKED; + spin_unlock(&info->trans_lock); } - if (lock && cur_trans->blocked && !cur_trans->in_commit) { + if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) { /* * We may race with somebody else here so end up having @@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (trans->type < TRANS_JOIN_NOLOCK) + if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); + extwriter_counter_dec(cur_trans, trans->type); smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) @@ -707,23 +755,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; - - ret = __btrfs_end_transaction(trans, root, 0); - if (ret) - return ret; - return 0; + return __btrfs_end_transaction(trans, root, 0); } int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; - - ret = __btrfs_end_transaction(trans, root, 1); - if (ret) - return ret; - return 0; + return __btrfs_end_transaction(trans, root, 1); } int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, @@ -746,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; - struct blk_plug plug; - blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -762,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; - blk_finish_plug(&plug); return werr; } @@ -807,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, { int ret; int ret2; + struct blk_plug plug; + blk_start_plug(&plug); ret = btrfs_write_marked_extents(root, dirty_pages, mark); + blk_finish_plug(&plug); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); if (ret) @@ -948,7 +986,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add(&root->root_list, &root->fs_info->dead_roots); + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -1179,13 +1217,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); + if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { + memset(new_root_item->received_uuid, 0, + sizeof(new_root_item->received_uuid)); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + } new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); - memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); - memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); - btrfs_set_root_stransid(new_root_item, 0); - btrfs_set_root_rtransid(new_root_item, 0); old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); @@ -1324,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root) int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; int ret = 0; + spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->in_commit; + trans = info->running_transaction; + if (trans) + ret = (trans->state >= TRANS_STATE_COMMIT_START); spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; int ret = 0; + spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->blocked; + trans = info->running_transaction; + if (trans) + ret = is_transaction_blocked(trans); spin_unlock(&info->trans_lock); return ret; } @@ -1349,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) static void wait_current_trans_commit_start(struct btrfs_root *root, struct btrfs_transaction *trans) { - wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); + wait_event(root->fs_info->transaction_blocked_wait, + trans->state >= TRANS_STATE_COMMIT_START || + trans->aborted); } /* @@ -1360,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_transaction *trans) { wait_event(root->fs_info->transaction_wait, - trans->commit_done || (trans->in_commit && !trans->blocked)); + trans->state >= TRANS_STATE_UNBLOCKED || + trans->aborted); } /* @@ -1456,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); - if (list_empty(&cur_trans->list)) { - spin_unlock(&root->fs_info->trans_lock); - btrfs_end_transaction(trans, root); - return; - } + /* + * If the transaction is removed from the list, it means this + * transaction has been committed successfully, so it is impossible + * to call the cleanup function. + */ + BUG_ON(list_empty(&cur_trans->list)); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { - root->fs_info->trans_no_join = 1; + cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&root->fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; } spin_unlock(&root->fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, root); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + put_transaction(cur_trans); put_transaction(cur_trans); @@ -1492,24 +1548,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(&root->fs_info->trans_lock); - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; - spin_unlock(&root->fs_info->trans_lock); - } - - if (flush_on_commit || snap_pending) { - ret = btrfs_start_delalloc_inodes(root, 1); - if (ret) - return ret; - btrfs_wait_ordered_extents(root, 1); - } - ret = btrfs_run_delayed_items(trans, root); if (ret) return ret; @@ -1533,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, return ret; } -/* - * btrfs_transaction state sequence: - * in_commit = 0, blocked = 0 (initial) - * in_commit = 1, blocked = 1 - * blocked = 0 - * commit_done = 1 - */ +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + return btrfs_start_all_delalloc_inodes(fs_info, 1); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + btrfs_wait_all_ordered_extents(fs_info, 1); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; - DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { @@ -1588,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * start sending their work down. */ cur_trans->delayed_refs.flushing = 1; + smp_wmb(); if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); @@ -1598,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - spin_lock(&cur_trans->commit_lock); - if (cur_trans->in_commit) { - spin_unlock(&cur_trans->commit_lock); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans, root); @@ -1611,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - trans->transaction->in_commit = 1; - trans->transaction->blocked = 1; - spin_unlock(&cur_trans->commit_lock); + cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); - spin_lock(&root->fs_info->trans_lock); if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (!prev_trans->commit_done) { + if (prev_trans->state != TRANS_STATE_COMPLETED) { atomic_inc(&prev_trans->use_count); spin_unlock(&root->fs_info->trans_lock); @@ -1634,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); } - if (!btrfs_test_opt(root, SSD) && - (now < cur_trans->start_time || now - cur_trans->start_time < 1)) - should_grow = 1; - - do { - joined = cur_trans->num_joined; - - WARN_ON(cur_trans != trans->transaction); + extwriter_counter_dec(cur_trans, trans->type); - ret = btrfs_flush_all_pending_stuffs(trans, root); - if (ret) - goto cleanup_transaction; - - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); + ret = btrfs_start_delalloc_flush(root->fs_info); + if (ret) + goto cleanup_transaction; - if (atomic_read(&cur_trans->num_writers) > 1) - schedule_timeout(MAX_SCHEDULE_TIMEOUT); - else if (should_grow) - schedule_timeout(1); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; - finish_wait(&cur_trans->writer_wait, &wait); - } while (atomic_read(&cur_trans->num_writers) > 1 || - (should_grow && cur_trans->num_joined != joined)); + wait_event(cur_trans->writer_wait, + extwriter_counter_read(cur_trans) == 0); + /* some pending stuffs might be added after the previous flush. */ ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; + btrfs_wait_delalloc_flush(root->fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting - * no_join so make sure to wait for num_writers to == 1 again. + * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; + cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&root->fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -1796,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); - trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->reloc_mutex); @@ -1808,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { btrfs_error(root->fs_info, ret, - "Error while writing out transaction."); + "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto cleanup_transaction; } @@ -1827,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - cur_trans->commit_done = 1; - root->fs_info->last_trans_committed = cur_trans->transid; - + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); spin_lock(&root->fs_info->trans_lock); @@ -1840,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - if (trans->type < TRANS_JOIN_NOLOCK) + if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); trace_btrfs_transaction_commit(root); @@ -1864,8 +1895,7 @@ cleanup_transaction: btrfs_qgroup_free(root, trans->qgroup_reserved); trans->qgroup_reserved = 0; } - btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); -// WARN_ON(1); + btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; cleanup_transaction(trans, root, ret); @@ -1874,31 +1904,44 @@ cleanup_transaction: } /* - * interface function to delete all the snapshots we have scheduled for deletion + * return < 0 if error + * 0 if there are no more dead_roots at the time of call + * 1 there are more to be processed, call me again + * + * The return value indicates there are certainly more snapshots to delete, but + * if there comes a new one during processing, it may return 0. We don't mind, + * because btrfs_commit_super will poke cleaner thread and it will process it a + * few seconds later. */ -int btrfs_clean_old_snapshots(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) { - LIST_HEAD(list); + int ret; struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); - list_splice_init(&fs_info->dead_roots, &list); + if (list_empty(&fs_info->dead_roots)) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + list_del(&root->root_list); spin_unlock(&fs_info->trans_lock); - while (!list_empty(&list)) { - int ret; - - root = list_entry(list.next, struct btrfs_root, root_list); - list_del(&root->root_list); + pr_debug("btrfs: cleaner removing %llu\n", + (unsigned long long)root->objectid); - btrfs_kill_all_delayed_nodes(root); + btrfs_kill_all_delayed_nodes(root); - if (btrfs_header_backref_rev(root->node) < - BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); - else - ret =btrfs_drop_snapshot(root, NULL, 1, 0); - BUG_ON(ret < 0); - } - return 0; + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + ret = btrfs_drop_snapshot(root, NULL, 0, 0); + else + ret = btrfs_drop_snapshot(root, NULL, 1, 0); + /* + * If we encounter a transaction abort during snapshot cleaning, we + * don't want to crash here + */ + BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS); + return 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 3c8e0d25c8e4..005b0375d18c 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -22,21 +22,33 @@ #include "delayed-ref.h" #include "ctree.h" +enum btrfs_trans_state { + TRANS_STATE_RUNNING = 0, + TRANS_STATE_BLOCKED = 1, + TRANS_STATE_COMMIT_START = 2, + TRANS_STATE_COMMIT_DOING = 3, + TRANS_STATE_UNBLOCKED = 4, + TRANS_STATE_COMPLETED = 5, + TRANS_STATE_MAX = 6, +}; + struct btrfs_transaction { u64 transid; /* + * total external writers(USERSPACE/START/ATTACH) in this + * transaction, it must be zero before the transaction is + * being committed + */ + atomic_t num_extwriters; + /* * total writers in this transaction, it must be zero before the * transaction can end */ atomic_t num_writers; atomic_t use_count; - unsigned long num_joined; - - spinlock_t commit_lock; - int in_commit; - int commit_done; - int blocked; + /* Be protected by fs_info->trans_lock when we want to change it. */ + enum btrfs_trans_state state; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; @@ -44,17 +56,27 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head ordered_operations; + struct list_head pending_chunks; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, - TRANS_JOIN_NOLOCK, - TRANS_ATTACH, -}; +#define __TRANS_FREEZABLE (1U << 0) + +#define __TRANS_USERSPACE (1U << 8) +#define __TRANS_START (1U << 9) +#define __TRANS_ATTACH (1U << 10) +#define __TRANS_JOIN (1U << 11) +#define __TRANS_JOIN_NOLOCK (1U << 12) + +#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) +#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) +#define TRANS_ATTACH (__TRANS_ATTACH) +#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) +#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) + +#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ + __TRANS_ATTACH) struct btrfs_trans_handle { u64 transid; @@ -70,7 +92,7 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; - enum btrfs_trans_type type; + unsigned int type; /* * this root is only needed to validate that the root passed to * start_transaction is the same as the one passed to end_transaction. @@ -123,7 +145,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); -int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, @@ -146,5 +168,4 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); -void put_transaction(struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ef96381569a4..2c6791493637 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <linux/list_sort.h> #include "ctree.h" #include "transaction.h" @@ -277,17 +278,31 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { + int ret = 0; + + /* + * If this fs is mixed then we need to be able to process the leaves to + * pin down any logged extents, so we have to read the block. + */ + if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; + } + if (wc->pin) - btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, - eb->start, eb->len); + ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, + eb->start, eb->len); - if (btrfs_buffer_uptodate(eb, gen, 0)) { + if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { + if (wc->pin && btrfs_header_level(eb) == 0) + ret = btrfs_exclude_logged_extents(log, eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) btrfs_wait_tree_block_writeback(eb); } - return 0; + return ret; } /* @@ -408,9 +423,9 @@ insert: found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(root, path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(trans, root, path, + btrfs_extend_item(root, path, item_size - found_size); } else if (ret) { return ret; @@ -587,7 +602,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); - BUG_ON(ret); + if (ret) + goto out; if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -597,7 +613,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - BUG_ON(ret); + if (ret) + goto out; dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); copy_extent_buffer(path->nodes[0], eb, dest_offset, @@ -623,7 +640,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset, 0); - BUG_ON(ret); + if (ret) + goto out; } else { /* * insert the extent pointer in the extent @@ -632,7 +650,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_logged_file_extent(trans, root, root->root_key.objectid, key->objectid, offset, &ins); - BUG_ON(ret); + if (ret) + goto out; } btrfs_release_path(path); @@ -649,26 +668,30 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(root->log_root, csum_start, csum_end - 1, &ordered_sums, 0); - BUG_ON(ret); + if (ret) + goto out; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, + if (!ret) + ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); - BUG_ON(ret); list_del(&sums->list); kfree(sums); } + if (ret) + goto out; } else { btrfs_release_path(path); } } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); + if (ret) + goto out; } inode_add_bytes(inode, nbytes); @@ -713,20 +736,21 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, inode = read_one_inode(root, location.objectid); if (!inode) { - kfree(name); - return -EIO; + ret = -EIO; + goto out; } ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); + if (ret) + goto out; ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); + if (ret) + goto out; + btrfs_run_delayed_items(trans, root); +out: kfree(name); - iput(inode); - - btrfs_run_delayed_items(trans, root); return ret; } @@ -879,7 +903,8 @@ again: victim_name_len = btrfs_inode_ref_name_len(leaf, victim_ref); victim_name = kmalloc(victim_name_len, GFP_NOFS); - BUG_ON(!victim_name); + if (!victim_name) + return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)(victim_ref + 1), @@ -895,9 +920,10 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); - BUG_ON(ret); - btrfs_run_delayed_items(trans, root); kfree(victim_name); + if (ret) + return ret; + btrfs_run_delayed_items(trans, root); *search_done = 1; goto again; } @@ -905,7 +931,6 @@ again: ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } - BUG_ON(ret); /* * NOTE: we have searched root tree and checked the @@ -939,6 +964,8 @@ again: goto next; victim_name = kmalloc(victim_name_len, GFP_NOFS); + if (!victim_name) + return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, victim_name_len); @@ -965,14 +992,16 @@ again: victim_name_len); btrfs_run_delayed_items(trans, root); } - BUG_ON(ret); iput(victim_parent); kfree(victim_name); + if (ret) + return ret; *search_done = 1; goto again; } kfree(victim_name); - BUG_ON(ret); + if (ret) + return ret; next: cur_offset += victim_name_len + sizeof(*extref); } @@ -985,7 +1014,8 @@ next: ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + if (ret) + return ret; } btrfs_release_path(path); @@ -994,7 +1024,8 @@ next: name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + if (ret) + return ret; } btrfs_release_path(path); @@ -1139,15 +1170,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, parent_objectid, ref_index, name, namelen, &search_done); - if (ret == 1) + if (ret == 1) { + ret = 0; + goto out; + } + if (ret) goto out; - BUG_ON(ret); } /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, ref_index); - BUG_ON(ret); + if (ret) + goto out; btrfs_update_inode(trans, root, inode); } @@ -1162,13 +1197,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); - out: btrfs_release_path(path); iput(dir); iput(inode); - return 0; + return ret; } static int insert_orphan_item(struct btrfs_trans_handle *trans, @@ -1326,10 +1359,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); - BUG_ON(ret); + if (ret) + goto out; } ret = insert_orphan_item(trans, root, ino); - BUG_ON(ret); } out: @@ -1374,9 +1407,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, return -EIO; ret = fixup_inode_link_count(trans, root, inode); - BUG_ON(ret); - iput(inode); + if (ret) + goto out; /* * fixup on a directory may create new entries, @@ -1426,7 +1459,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, } else if (ret == -EEXIST) { ret = 0; } else { - BUG(); + BUG(); /* Logic Error */ } iput(inode); @@ -1495,7 +1528,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct inode *dir; u8 log_type; int exists; - int ret; + int ret = 0; dir = read_one_inode(root, key->objectid); if (!dir) @@ -1527,7 +1560,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, key->offset, name, name_len, 1); } else { - BUG(); + /* Corruption */ + ret = -EINVAL; + goto out; } if (IS_ERR_OR_NULL(dst_di)) { /* we need a sequence number to insert, so we only @@ -1555,7 +1590,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; ret = drop_one_dir_item(trans, root, path, dir, dst_di); - BUG_ON(ret); + if (ret) + goto out; if (key->type == BTRFS_DIR_INDEX_KEY) goto insert; @@ -1563,14 +1599,15 @@ out: btrfs_release_path(path); kfree(name); iput(dir); - return 0; + return ret; insert: btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + goto out; + ret = 0; goto out; } @@ -1601,7 +1638,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - BUG_ON(ret); + if (ret) + return ret; ptr = (unsigned long)(di + 1); ptr += name_len; } @@ -1762,16 +1800,21 @@ again: ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); + if (ret) { + kfree(name); + iput(inode); + goto out; + } + btrfs_inc_nlink(inode); ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); - - btrfs_run_delayed_items(trans, root); - + if (!ret) + btrfs_run_delayed_items(trans, root); kfree(name); iput(inode); + if (ret) + goto out; /* there might still be more names under this key * check and repeat if required @@ -1875,7 +1918,8 @@ again: ret = check_item_in_log(trans, root, log, path, log_path, dir, &found_key); - BUG_ON(ret); + if (ret) + goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; @@ -1952,11 +1996,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, root, log, path, key.objectid, 0); - BUG_ON(ret); + if (ret) + break; } ret = overwrite_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; /* for regular files, make sure corresponding * orhpan item exist. extents past the new EOF @@ -1965,12 +2011,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (S_ISREG(mode)) { ret = insert_orphan_item(wc->trans, root, key.objectid); - BUG_ON(ret); + if (ret) + break; } ret = link_to_fixup_dir(wc->trans, root, path, key.objectid); - BUG_ON(ret); + if (ret) + break; } if (wc->stage < LOG_WALK_REPLAY_ALL) continue; @@ -1979,28 +2027,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (key.type == BTRFS_XATTR_ITEM_KEY) { ret = overwrite_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); - } else if (key.type == BTRFS_INODE_REF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); - BUG_ON(ret && ret != -ENOENT); - } else if (key.type == BTRFS_INODE_EXTREF_KEY) { + if (ret) + break; + } else if (key.type == BTRFS_INODE_REF_KEY || + key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + break; + ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; } else if (key.type == BTRFS_DIR_ITEM_KEY || key.type == BTRFS_DIR_INDEX_KEY) { ret = replay_one_dir_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; } } btrfs_free_path(path); - return 0; + return ret; } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, @@ -2045,8 +2095,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); - if (ret) + if (ret) { + free_extent_buffer(next); return ret; + } path->slots[*level]++; if (wc->free) { @@ -2066,7 +2118,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); /* -ENOMEM or logic errors */ + if (ret) { + free_extent_buffer(next); + return ret; + } } free_extent_buffer(next); continue; @@ -2139,7 +2194,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); - BUG_ON(ret); + if (ret) + return ret; } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2161,7 +2217,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, int wret; int level; struct btrfs_path *path; - int i; int orig_level; path = btrfs_alloc_path(); @@ -2213,17 +2268,12 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); - BUG_ON(ret); /* -ENOMEM or logic errors */ + if (ret) + goto out; } } out: - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } btrfs_free_path(path); return ret; } @@ -2316,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; unsigned long log_transid = 0; + struct blk_plug plug; mutex_lock(&root->log_mutex); log_transid = root->log_transid; @@ -2359,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ + blk_start_plug(&plug); ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { + blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2395,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + blk_finish_plug(&plug); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -2410,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); @@ -2432,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2439,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + ret = btrfs_write_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + blk_finish_plug(&plug); if (ret) { btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); @@ -2449,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, @@ -2507,7 +2567,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (trans) { ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); + + /* I don't think this can happen but just in case */ + if (ret) + btrfs_abort_transaction(trans, log, ret); } while (1) { @@ -2615,7 +2678,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; - BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } } btrfs_release_path(path); di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, @@ -2627,7 +2693,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; - BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } } /* update the directory size in the log to reflect the names @@ -2966,7 +3035,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Logic error */ if (ret < 0) break; @@ -3169,7 +3238,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, log->fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); - BUG_ON(ret); + if (ret) { + btrfs_release_path(dst_path); + kfree(ins_data); + return ret; + } } } } @@ -3209,115 +3282,6 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int drop_adjacent_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct extent_map *em, - struct btrfs_path *path) -{ - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - struct btrfs_key key, new_key; - struct btrfs_map_token token; - u64 extent_end; - u64 extent_offset = 0; - int extent_type; - int del_slot = 0; - int del_nr = 0; - int ret = 0; - - while (1) { - btrfs_init_map_token(&token); - leaf = path->nodes[0]; - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - if (del_nr) { - ret = btrfs_del_items(trans, root, path, - del_slot, del_nr); - if (ret) - return ret; - del_nr = 0; - } - - ret = btrfs_next_leaf_write(trans, root, path, 1); - if (ret < 0) - return ret; - if (ret > 0) - return 0; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY || - key.offset >= em->start + em->len) - break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_token_file_extent_type(leaf, fi, &token); - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_offset = btrfs_token_file_extent_offset(leaf, - fi, &token); - extent_end = key.offset + - btrfs_token_file_extent_num_bytes(leaf, fi, - &token); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); - } else { - BUG(); - } - - if (extent_end <= em->len + em->start) { - if (!del_nr) { - del_slot = path->slots[0]; - } - del_nr++; - continue; - } - - /* - * Ok so we'll ignore previous items if we log a new extent, - * which can lead to overlapping extents, so if we have an - * existing extent we want to adjust we _have_ to check the next - * guy to make sure we even need this extent anymore, this keeps - * us from panicing in set_item_key_safe. - */ - if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { - struct btrfs_key tmp_key; - - btrfs_item_key_to_cpu(leaf, &tmp_key, - path->slots[0] + 1); - if (tmp_key.objectid == btrfs_ino(inode) && - tmp_key.type == BTRFS_EXTENT_DATA_KEY && - tmp_key.offset <= em->start + em->len) { - if (!del_nr) - del_slot = path->slots[0]; - del_nr++; - continue; - } - } - - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = em->start + em->len; - btrfs_set_item_key_safe(trans, root, path, &new_key); - extent_offset += em->start + em->len - key.offset; - btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, - &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - - (em->start + em->len), - &token); - btrfs_mark_buffer_dirty(leaf); - } - - if (del_nr) - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - - return ret; -} - static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, struct extent_map *em, struct btrfs_path *path) @@ -3339,39 +3303,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; -insert: + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0); + if (ret) + return ret; + INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; - path->really_keep_locks = 1; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); - if (ret && ret != -EEXIST) { - path->really_keep_locks = 0; + if (ret) return ret; - } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* - * If we are overwriting an inline extent with a real one then we need - * to just delete the inline extent as it may not be large enough to - * have the entire file_extent_item. - */ - if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == - BTRFS_FILE_EXTENT_INLINE) { - ret = btrfs_del_item(trans, log, path); - btrfs_release_path(path); - if (ret) { - path->really_keep_locks = 0; - return ret; - } - goto insert; - } - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3410,22 +3359,14 @@ insert: em->start - em->orig_start, &token); btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, &token); btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); btrfs_mark_buffer_dirty(leaf); - /* - * Have to check the extent to the right of us to make sure it doesn't - * fall in our current range. We're ok if the previous extent is in our - * range since the recovery stuff will run us in key order and thus just - * drop the part we overwrote. - */ - ret = drop_adjacent_extents(trans, log, inode, em, path); btrfs_release_path(path); - path->really_keep_locks = 0; if (ret) { return ret; } @@ -3650,8 +3591,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool fast_search = false; u64 ino = btrfs_ino(inode); - log = root->log_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3918,9 +3857,9 @@ out: * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) +static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -4095,8 +4034,7 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root_no_radix(log_root_tree, - &found_key); + log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); btrfs_error(fs_info, ret, @@ -4111,6 +4049,9 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); btrfs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; @@ -4119,12 +4060,10 @@ again: wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); - if (wc.stage == LOG_WALK_REPLAY_ALL) { + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); - BUG_ON(ret); } key.offset = found_key.offset - 1; @@ -4133,6 +4072,9 @@ again: free_extent_buffer(log->commit_root); kfree(log); + if (ret) + goto error; + if (found_key.offset == 0) break; } @@ -4153,17 +4095,20 @@ again: btrfs_free_path(path); + /* step 4: commit the transaction, which also unpins the blocks */ + ret = btrfs_commit_transaction(trans, fs_info->tree_root); + if (ret) + return ret; + free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; fs_info->log_root_recovering = 0; - - /* step 4: commit the transaction, which also unpins the blocks */ - btrfs_commit_transaction(trans, fs_info->tree_root); - kfree(log_root_tree); - return 0; + return 0; error: + if (wc.trans) + btrfs_end_transaction(wc.trans, fs_info->tree_root); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 862ac813f6b8..1d4ae0d15a70 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -40,9 +40,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); int btrfs_pin_log_trans(struct btrfs_root *root); -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, int for_rename); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ddc61cad0080..b0a523b2c60e 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -53,6 +53,7 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; ulist->nodes = ulist->int_nodes; ulist->nodes_alloced = ULIST_SIZE; + ulist->root = RB_ROOT; } EXPORT_SYMBOL(ulist_init); @@ -72,6 +73,7 @@ void ulist_fini(struct ulist *ulist) if (ulist->nodes_alloced > ULIST_SIZE) kfree(ulist->nodes); ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ + ulist->root = RB_ROOT; } EXPORT_SYMBOL(ulist_fini); @@ -123,6 +125,45 @@ void ulist_free(struct ulist *ulist) } EXPORT_SYMBOL(ulist_free); +static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) +{ + struct rb_node *n = ulist->root.rb_node; + struct ulist_node *u = NULL; + + while (n) { + u = rb_entry(n, struct ulist_node, rb_node); + if (u->val < val) + n = n->rb_right; + else if (u->val > val) + n = n->rb_left; + else + return u; + } + return NULL; +} + +static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) +{ + struct rb_node **p = &ulist->root.rb_node; + struct rb_node *parent = NULL; + struct ulist_node *cur = NULL; + + while (*p) { + parent = *p; + cur = rb_entry(parent, struct ulist_node, rb_node); + + if (cur->val < ins->val) + p = &(*p)->rb_right; + else if (cur->val > ins->val) + p = &(*p)->rb_left; + else + return -EEXIST; + } + rb_link_node(&ins->rb_node, parent, p); + rb_insert_color(&ins->rb_node, &ulist->root); + return 0; +} + /** * ulist_add - add an element to the ulist * @ulist: ulist to add the element to @@ -151,20 +192,23 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { - int i; - - for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) { - if (old_aux) - *old_aux = ulist->nodes[i].aux; - return 0; - } + int ret = 0; + struct ulist_node *node = NULL; + node = ulist_rbtree_search(ulist, val); + if (node) { + if (old_aux) + *old_aux = node->aux; + return 0; } if (ulist->nnodes >= ulist->nodes_alloced) { u64 new_alloced = ulist->nodes_alloced + 128; struct ulist_node *new_nodes; void *old = NULL; + int i; + + for (i = 0; i < ulist->nnodes; i++) + rb_erase(&ulist->nodes[i].rb_node, &ulist->root); /* * if nodes_alloced == ULIST_SIZE no memory has been allocated @@ -184,9 +228,22 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, ulist->nodes = new_nodes; ulist->nodes_alloced = new_alloced; + + /* + * krealloc actually uses memcpy, which does not copy rb_node + * pointers, so we have to do it ourselves. Otherwise we may + * be bitten by crashes. + */ + for (i = 0; i < ulist->nnodes; i++) { + ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); + if (ret < 0) + return ret; + } } ulist->nodes[ulist->nnodes].val = val; ulist->nodes[ulist->nnodes].aux = aux; + ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]); + BUG_ON(ret); ++ulist->nnodes; return 1; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 21a1963439c3..fb36731074b5 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -8,6 +8,9 @@ #ifndef __ULIST__ #define __ULIST__ +#include <linux/list.h> +#include <linux/rbtree.h> + /* * ulist is a generic data structure to hold a collection of unique u64 * values. The only operations it supports is adding to the list and @@ -34,6 +37,7 @@ struct ulist_iterator { struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ + struct rb_node rb_node; /* used to speed up search */ }; struct ulist { @@ -54,6 +58,8 @@ struct ulist { */ struct ulist_node *nodes; + struct rb_root root; + /* * inline storage space for the first ULIST_SIZE entries */ diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h deleted file mode 100644 index 9bf3946d5ef2..000000000000 --- a/fs/btrfs/version.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __BTRFS_VERSION_H -#define __BTRFS_VERSION_H -#define BTRFS_BUILD_VERSION "Btrfs" -#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2854c824ab64..78b871753cb6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -46,6 +46,7 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); @@ -717,9 +718,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh); - if (ret) + /* Just open everything we can; ignore failures here */ + if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh)) continue; disk_super = (struct btrfs_super_block *)bh->b_data; @@ -981,6 +982,35 @@ out: return ret; } +static int contains_pending_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 *start, u64 len) +{ + struct extent_map *em; + int ret = 0; + + list_for_each_entry(em, &trans->transaction->pending_chunks, list) { + struct map_lookup *map; + int i; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev != device) + continue; + if (map->stripes[i].physical >= *start + len || + map->stripes[i].physical + em->orig_block_len <= + *start) + continue; + *start = map->stripes[i].physical + + em->orig_block_len; + ret = 1; + } + } + + return ret; +} + + /* * find_free_dev_extent - find free space in the specified device * @device: the device which we search the free space in @@ -1001,7 +1031,8 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -1025,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, */ search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: max_hole_start = search_start; max_hole_size = 0; hole_size = 0; if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; - goto error; + goto out; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto error; - } path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; key.objectid = device->devid; key.offset = search_start; @@ -1080,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, if (key.offset > search_start) { hole_size = key.offset - search_start; + /* + * Have to check before we set max_hole_start, otherwise + * we could end up sending back this offset anyway. + */ + if (contains_pending_extent(trans, device, + &search_start, + hole_size)) + hole_size = 0; + if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; @@ -1123,6 +1164,11 @@ next: max_hole_size = hole_size; } + if (contains_pending_extent(trans, device, &search_start, hole_size)) { + btrfs_release_path(path); + goto again; + } + /* See above. */ if (hole_size < num_bytes) ret = -ENOSPC; @@ -1131,7 +1177,6 @@ next: out: btrfs_free_path(path); -error: *start = max_hole_start; if (len) *len = max_hole_size; @@ -1199,10 +1244,10 @@ out: return ret; } -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes) +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) { int ret; struct btrfs_path *path; @@ -1243,47 +1288,22 @@ out: return ret; } -static noinline int find_next_chunk(struct btrfs_root *root, - u64 objectid, u64 *offset) +static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct btrfs_chunk *chunk; - struct btrfs_key found_key; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = objectid; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - - BUG_ON(ret == 0); /* Corruption */ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct rb_node *n; + u64 ret = 0; - ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - *offset = 0; - } else { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != objectid) - *offset = 0; - else { - chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_chunk); - *offset = found_key.offset + - btrfs_chunk_length(path->nodes[0], chunk); - } + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + n = rb_last(&em_tree->map); + if (n) { + em = rb_entry(n, struct extent_map, rb_node); + ret = em->start + em->len; } - ret = 0; -error: - btrfs_free_path(path); + read_unlock(&em_tree->lock); + return ret; } @@ -1329,9 +1349,9 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device) +static int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) { int ret; struct btrfs_path *path; @@ -1461,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) btrfs_dev_replace_unlock(&root->fs_info->dev_replace); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && root->fs_info->fs_devices->rw_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid5\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && root->fs_info->fs_devices->rw_devices <= 3) { - printk(KERN_ERR "btrfs: unable to go below three " - "devices on raid6\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; goto out; } @@ -1511,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bh = NULL; disk_super = NULL; if (!device) { - printk(KERN_ERR "btrfs: no missing devices found to " - "remove\n"); + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; goto out; } } else { @@ -1534,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if (device->is_tgtdev_for_dev_replace) { - pr_err("btrfs: unable to remove the dev_replace target dev\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto error_brelse; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { - printk(KERN_ERR "btrfs: unable to remove the only writeable " - "device\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto error_brelse; } @@ -1710,8 +1718,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); } -int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, - struct btrfs_device **device) +static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) { int ret = 0; struct btrfs_super_block *disk_super; @@ -3119,14 +3127,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices < 4) + else if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6); - + if (num_devices > 2) + allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices > 3) + allowed |= (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { @@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); - if (IS_ERR(tsk)) - return PTR_ERR(tsk); - - return 0; + return PTR_RET(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) @@ -3607,7 +3611,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, @@ -3674,25 +3678,15 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - u64 features; - if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) return; - features = btrfs_super_incompat_flags(info->super_copy); - if (features & BTRFS_FEATURE_INCOMPAT_RAID56) - return; - - features |= BTRFS_FEATURE_INCOMPAT_RAID56; - btrfs_set_super_incompat_flags(info->super_copy, features); - printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); + btrfs_set_fs_incompat(info, RAID56); } static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes_out, u64 *stripe_size_out, - u64 start, u64 type) + struct btrfs_root *extent_root, u64 start, + u64 type) { struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -3799,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(device, + ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -3911,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - *map_ret = map; num_bytes = stripe_size * data_stripes; - *stripe_size_out = stripe_size; - *num_bytes_out = num_bytes; - trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); em = alloc_extent_map(); @@ -3929,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->len = num_bytes; em->block_start = 0; em->block_len = em->len; + em->orig_block_len = stripe_size; em_tree = &extent_root->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); + if (!ret) { + list_add_tail(&em->list, &trans->transaction->pending_chunks); + atomic_inc(&em->refs); + } write_unlock(&em_tree->lock); if (ret) { free_extent_map(em); goto error; } - for (i = 0; i < map->num_stripes; ++i) { - struct btrfs_device *device; - u64 dev_offset; - - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; - - ret = btrfs_alloc_dev_extent(trans, device, - info->chunk_root->root_key.objectid, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, dev_offset, stripe_size); - if (ret) - goto error_dev_extent; - } - ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); - if (ret) { - i = map->num_stripes - 1; - goto error_dev_extent; - } + if (ret) + goto error_del_extent; free_extent_map(em); check_raid56_incompat_flag(extent_root->fs_info, type); @@ -3968,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, kfree(devices_info); return 0; -error_dev_extent: - for (; i >= 0; i--) { - struct btrfs_device *device; - int err; - - device = map->stripes[i].dev; - err = btrfs_free_dev_extent(trans, device, start); - if (err) { - btrfs_abort_transaction(trans, extent_root, err); - break; - } - } +error_del_extent: write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -3994,33 +3961,68 @@ error: return ret; } -static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, - struct map_lookup *map, u64 chunk_offset, - u64 chunk_size, u64 stripe_size) + u64 chunk_offset, u64 chunk_size) { - u64 dev_offset; struct btrfs_key key; struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - size_t item_size = btrfs_chunk_item_size(map->num_stripes); - int index = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + u64 dev_offset; + u64 stripe_size; + int i = 0; int ret; + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(extent_root->fs_info, "unable to find logical " + "%Lu len %Lu", chunk_offset, chunk_size); + return -EINVAL; + } + + if (em->start != chunk_offset || em->len != chunk_size) { + btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" + " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + chunk_size, em->start, em->len); + free_extent_map(em); + return -EINVAL; + } + + map = (struct map_lookup *)em->bdev; + item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) - return -ENOMEM; + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; - index = 0; - while (index < map->num_stripes) { - device = map->stripes[index].dev; device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); if (ret) - goto out_free; - index++; + goto out; + ret = btrfs_alloc_dev_extent(trans, device, + chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_offset, dev_offset, + stripe_size); + if (ret) + goto out; } spin_lock(&extent_root->fs_info->free_chunk_lock); @@ -4028,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, map->num_stripes); spin_unlock(&extent_root->fs_info->free_chunk_lock); - index = 0; stripe = &chunk->stripe; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - dev_offset = map->stripes[index].physical; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; - index++; } btrfs_set_stack_chunk_length(chunk, chunk_size); @@ -4056,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, key.offset = chunk_offset; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { /* * TODO: Cleanup of inserted chunk root in case of @@ -4066,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); } -out_free: +out: kfree(chunk); + free_extent_map(em); return ret; } @@ -4082,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type) { u64 chunk_offset; - u64 chunk_size; - u64 stripe_size; - struct map_lookup *map; - struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; - int ret; - - ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - &chunk_offset); - if (ret) - return ret; - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, type); - if (ret) - return ret; - - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) - return ret; - return 0; + chunk_offset = find_next_chunk(extent_root->fs_info); + return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, @@ -4111,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, { u64 chunk_offset; u64 sys_chunk_offset; - u64 chunk_size; - u64 sys_chunk_size; - u64 stripe_size; - u64 sys_stripe_size; u64 alloc_profile; - struct map_lookup *map; - struct map_lookup *sys_map; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; int ret; - ret = find_next_chunk(fs_info->chunk_root, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - if (ret) - return ret; - + chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_get_alloc_profile(extent_root, 0); - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, alloc_profile); + ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, + alloc_profile); if (ret) return ret; - sys_chunk_offset = chunk_offset + chunk_size; - + sys_chunk_offset = find_next_chunk(root->fs_info); alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); - ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, - &sys_chunk_size, &sys_stripe_size, - sys_chunk_offset, alloc_profile); + ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, + alloc_profile); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; } ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - /* - * Modifying chunk tree needs allocating new blocks from both - * system block group and metadata block group. So we only can - * do operations require modifying the chunk tree after both - * block groups were created. - */ - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - ret = __finish_chunk_alloc(trans, extent_root, sys_map, - sys_chunk_offset, sys_chunk_size, - sys_stripe_size); if (ret) btrfs_abort_transaction(trans, root, ret); - out: - return ret; } @@ -4240,9 +4187,25 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, len); read_unlock(&em_tree->lock); - BUG_ON(!em); - BUG_ON(em->start > logical || em->start + em->len < logical); + /* + * We could return errors for these cases, but that could get ugly and + * we'd probably do the same thing which is just not do anything else + * and exit, so return 1 so the callers don't try to use other copies. + */ + if (!em) { + btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical, + logical+len); + return 1; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got " + "%Lu-%Lu\n", logical, logical+len, em->start, + em->start + em->len); + return 1; + } + map = (struct map_lookup *)em->bdev; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; @@ -4411,19 +4374,22 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n", - (unsigned long long)logical, - (unsigned long long)*length); - BUG(); + btrfs_crit(fs_info, "unable to find logical %llu len %llu", + (unsigned long long)logical, + (unsigned long long)*length); + return -EINVAL; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " + "found %Lu-%Lu\n", logical, em->start, + em->start + em->len); + return -EINVAL; } - BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - stripe_len = map->stripe_len; stripe_nr = offset; /* @@ -5004,42 +4970,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void *merge_stripe_index_into_bio_private(void *bi_private, - unsigned int stripe_index) -{ - /* - * with single, dup, RAID0, RAID1 and RAID10, stripe_index is - * at most 1. - * The alternative solution (instead of stealing bits from the - * pointer) would be to allocate an intermediate structure - * that contains the old private pointer plus the stripe_index. - */ - BUG_ON((((uintptr_t)bi_private) & 3) != 0); - BUG_ON(stripe_index > 3); - return (void *)(((uintptr_t)bi_private) | stripe_index); -} - -static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) -{ - return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); -} - -static unsigned int extract_stripe_index_from_bio_private(void *bi_private) -{ - return (unsigned int)((uintptr_t)bi_private) & 3; -} - static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) { atomic_inc(&bbio->error); if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = - extract_stripe_index_from_bio_private( - bio->bi_private); + btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); @@ -5069,8 +5009,7 @@ static void btrfs_end_bio(struct bio *bio, int err) } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ @@ -5106,9 +5045,9 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -noinline void btrfs_schedule_bio(struct btrfs_root *root, - struct btrfs_device *device, - int rw, struct bio *bio) +static noinline void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -5177,7 +5116,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, } prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if ((bio->bi_size >> 9) > max_sectors) + if (bio_sectors(bio) > max_sectors) return 0; if (!q->merge_bvec_fn) @@ -5196,8 +5135,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct btrfs_device *dev = bbio->stripes[dev_nr].dev; bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); + btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; bio->bi_sector = physical >> 9; #ifdef DEBUG @@ -5258,8 +5196,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) if (atomic_dec_and_test(&bbio->stripes_pending)) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); @@ -5308,10 +5245,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } if (map_length < length) { - printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " - "len %llu\n", (unsigned long long)logical, - (unsigned long long)length, - (unsigned long long)map_length); + btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", + (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); BUG(); } @@ -5337,7 +5274,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); + bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; @@ -5382,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, return NULL; list_add(&device->dev_list, &fs_devices->devices); - device->dev_root = root->fs_info->dev_root; device->devid = devid; device->work.func = pending_bios_fn; device->fs_devices = fs_devices; @@ -5476,7 +5412,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em); + ret = add_extent_mapping(&map_tree->map_tree, em, 0); write_unlock(&map_tree->map_tree.lock); BUG_ON(ret); /* Tree corruption */ free_extent_map(em); @@ -5583,8 +5519,8 @@ static int read_one_dev(struct btrfs_root *root, return -EIO; if (!device) { - printk(KERN_WARNING "warning devid %llu missing\n", - (unsigned long long)devid); + btrfs_warn(root->fs_info, "devid %llu missing", + (unsigned long long)devid); device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; @@ -5608,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root, } fill_device_from_item(leaf, dev_item, device); - device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; @@ -5766,6 +5701,17 @@ error: return ret; } +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); +} + static void __btrfs_reset_dev_stats(struct btrfs_device *dev) { int i; @@ -5926,7 +5872,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) btrfs_dev_stat_print_on_error(dev); } -void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 062d8604d35b..86705583480d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -152,6 +152,26 @@ struct btrfs_fs_devices { int rotating; }; +/* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). + * Really, what we need is a btrfs_bio structure that has this info + * and is properly sized with its stripe array, but we're not there + * quite yet. We have our own btrfs bioset, and all of the bios + * we allocate are actually btrfs_io_bios. We'll cram as much of + * struct btrfs_bio as we can into this over time. + */ +struct btrfs_io_bio { + unsigned long mirror_num; + unsigned long stripe_index; + struct bio bio; +}; + +static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_io_bio, bio); +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; @@ -254,10 +274,6 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); @@ -282,11 +298,6 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, - struct btrfs_device **device); -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); @@ -305,12 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); -void btrfs_dev_stat_print_on_error(struct btrfs_device *device); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats); +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -321,14 +333,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); -void btrfs_schedule_bio(struct btrfs_root *root, - struct btrfs_device *device, - int rw, struct bio *bio); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, struct btrfs_mapping_tree *map_tree, u64 logical); +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + u64 chunk_offset, u64 chunk_size); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 446a6848c554..05740b9789e4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -406,8 +406,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int btrfs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; diff --git a/fs/buffer.c b/fs/buffer.c index b4dcb34c9635..4d7433534f5c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh) EXPORT_SYMBOL(unlock_buffer); /* + * Returns if the page has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the PageDirty information is stale. If + * any of the pages are locked, it is assumed they are locked for IO. + */ +void buffer_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct buffer_head *head, *bh; + *dirty = false; + *writeback = false; + + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + return; + + if (PageWriteback(page)) + *writeback = true; + + head = page_buffers(page); + bh = head; + do { + if (buffer_locked(bh)) + *writeback = true; + + if (buffer_dirty(bh)) + *dirty = true; + + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(buffer_check_dirty_writeback); + +/* * Block until a buffer comes unlocked. This doesn't stop it * from becoming locked again - you have to lock it yourself * if you want to preserve its state. @@ -865,8 +899,6 @@ try_again: /* Link the buffer to its page */ set_bh_page(bh, page, offset); - - init_buffer(bh, NULL, NULL); } return head; /* @@ -1456,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh) * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected - * @offset: the index of the truncation point + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * * block_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. @@ -1467,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh) * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ -void block_invalidatepage(struct page *page, unsigned long offset) +void block_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; + unsigned int stop = length + offset; BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) goto out; + /* + * Check for overflow + */ + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + head = page_buffers(page); bh = head; do { @@ -1483,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset) next = bh->b_this_page; /* + * Are we still fully in range ? + */ + if (next_off > stop) + goto out; + + /* * is this block fully invalidated? */ if (offset <= curr_off) @@ -1503,6 +1549,7 @@ out: } EXPORT_SYMBOL(block_invalidatepage); + /* * We attach and possibly dirty the buffers atomically wrt * __set_page_dirty_buffers() via private_lock. try_to_free_buffers @@ -2843,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, * they may have been added in ext3_writepage(). Make them * freeable here, so the page does not leak. */ - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; /* don't care */ } @@ -2949,7 +2996,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) } } -int submit_bh(int rw, struct buffer_head * bh) +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) { struct bio *bio; int ret = 0; @@ -2979,15 +3026,20 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bh_eod(rw, bio, bh); + if (buffer_meta(bh)) + rw |= REQ_META; + if (buffer_prio(bh)) + rw |= REQ_PRIO; + bio_get(bio); submit_bio(rw, bio); @@ -2997,6 +3049,12 @@ int submit_bh(int rw, struct buffer_head * bh) bio_put(bio); return ret; } +EXPORT_SYMBOL_GPL(_submit_bh); + +int submit_bh(int rw, struct buffer_head *bh) +{ + return _submit_bh(rw, bh, 0); +} EXPORT_SYMBOL(submit_bh); /** diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 746ce532e130..d4c1206af9fc 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -13,8 +13,6 @@ #include <linux/mount.h> #include "internal.h" -#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) - struct cachefiles_lookup_data { struct cachefiles_xattr *auxdata; /* auxiliary data */ char *key; /* key path */ @@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object) object = container_of(_object, struct cachefiles_object, fscache); cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); + + if (!fscache_use_cookie(_object)) { + _leave(" [relinq]"); + return; + } + cookie = object->fscache.cookie; if (!cookie->def->get_aux) { + fscache_unuse_cookie(_object); _leave(" [no aux]"); return; } auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); if (!auxdata) { + fscache_unuse_cookie(_object); _leave(" [nomem]"); return; } auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + fscache_unuse_cookie(_object); ASSERTCMP(auxlen, <, 511); auxdata->len = auxlen + 1; @@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) #endif /* delete retired objects */ - if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 8c01c5fcdf75..25badd1aec5c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", - prefix, fscache_object_states[object->fscache.state], + prefix, object->fscache.state->name, object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, found_dentry: kdebug("preemptive burial: OBJ%x [%s] %p", object->fscache.debug_id, - fscache_object_states[object->fscache.state], + object->fscache.state->name, dentry); - if (object->fscache.state < FSCACHE_OBJECT_DYING) { + if (fscache_object_is_live(&object->fscache)) { printk(KERN_ERR "\n"); printk(KERN_ERR "CacheFiles: Error:" " Can't preemptively bury live object\n"); @@ -192,7 +192,7 @@ try_again: /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ wait_for_old_object: - if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { + if (fscache_object_is_live(&object->fscache)) { printk(KERN_ERR "\n"); printk(KERN_ERR "CacheFiles: Error:" " Unexpected object collision\n"); @@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); /* look up the victim */ - mutex_lock_nested(&dir->d_inode->i_mutex, 1); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 480992259707..ebaff368120d 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -12,6 +12,7 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/swap.h> #include "internal.h" /* @@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) */ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, struct fscache_retrieval *op, - struct page *netpage, - struct pagevec *pagevec) + struct page *netpage) { struct cachefiles_one_read *monitor; struct address_space *bmapping; @@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, _enter(""); - pagevec_reinit(pagevec); - _debug("read back %p{%lu,%d}", netpage, netpage->index, page_count(netpage)); @@ -283,9 +281,7 @@ installed_new_backing_page: backpage = newpage; newpage = NULL; - page_cache_get(backpage); - pagevec_add(pagevec, backpage); - __pagevec_lru_add_file(pagevec); + lru_cache_add_file(backpage); read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (block) { /* submit the apparently valid page to the backing fs to be * read from disk */ - ret = cachefiles_read_backing_file_one(object, op, page, - &pagevec); + ret = cachefiles_read_backing_file_one(object, op, page); } else if (cachefiles_has_space(cache, 0, 1) == 0) { /* there's space in the cache we can use */ fscache_mark_page_cached(op, page); @@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, { struct cachefiles_one_read *monitor = NULL; struct address_space *bmapping = object->backer->d_inode->i_mapping; - struct pagevec lru_pvec; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; _enter(""); - pagevec_init(&lru_pvec, 0); - list_for_each_entry_safe(netpage, _n, list, lru) { list_del(&netpage->lru); @@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backpage = newpage; newpage = NULL; - page_cache_get(backpage); - if (!pagevec_add(&lru_pvec, backpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(backpage); reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* install a monitor */ page_cache_get(netpage); @@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); @@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, out: /* tidy up */ - pagevec_lru_add_file(&lru_pvec); - if (newpage) page_cache_release(newpage); if (netpage) @@ -962,12 +946,14 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) } data = kmap(page); + file_start_write(file); old_fs = get_fs(); set_fs(KERNEL_DS); ret = file->f_op->write( file, (const void __user *) data, len, &pos); set_fs(old_fs); kunmap(page); + file_end_write(file); if (ret != len) ret = -EIO; } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 73b46288b54b..2476e5162609 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, struct dentry *dentry = object->dentry; int ret; - ASSERT(object->fscache.cookie); ASSERT(dentry); _enter("%p,#%d", object, auxdata->len); /* attempt to install the cache metadata directly */ - _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + _debug("SET #%u", auxdata->len); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, @@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, struct dentry *dentry = object->dentry; int ret; - ASSERT(object->fscache.cookie); ASSERT(dentry); _enter("%p,#%d", object, auxdata->len); /* attempt to install the cache metadata directly */ - _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + _debug("SET #%u", auxdata->len); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a60ea977af6f..5318a3b704f6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page) * dirty page counters appropriately. Only called if there is private * data on the page. */ -static void ceph_invalidatepage(struct page *page, unsigned long offset) +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode; struct ceph_inode_info *ci; @@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); ci = ceph_inode(inode); - if (offset == 0) { - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", - inode, page, page->index, offset); + if (offset == 0 && length == PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); page->private = 0; ClearPagePrivate(page); } else { - dout("%p invalidatepage %p idx %lu partial dirty page\n", - inode, page, page->index); + dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", + inode, page, page->index, offset, length); } } @@ -236,15 +237,21 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; + struct ceph_osd_data *osd_data; int rc = req->r_result; int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_pages[i]; + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -257,8 +264,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) SetPageUptodate(page); unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - kfree(req->r_pages); + kfree(osd_data->pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -279,6 +287,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; struct ceph_osd_request *req; u64 off; u64 len; @@ -303,18 +312,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); - - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), - off, &len, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 0); + false); if (IS_ERR(req)) return PTR_ERR(req); /* build page vector */ - nr_pages = len >> PAGE_CACHE_SHIFT; + nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); ret = -ENOMEM; if (!pages) @@ -336,11 +344,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_pages = pages; - req->r_num_pages = nr_pages; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -373,7 +382,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); @@ -429,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page_offset(page); - int len = PAGE_CACHE_SIZE; - loff_t i_size; - int err = 0; struct ceph_snap_context *snapc, *oldest; - u64 snap_size = 0; + loff_t page_off = page_offset(page); long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -465,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + /* is this a partial page at end of file? */ - if (snap_size) - i_size = snap_size; - else - i_size = i_size_read(inode); - if (i_size < page_off + len) - len = i_size - page_off; + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); @@ -485,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, - ci->i_truncate_seq, ci->i_truncate_size, + truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); @@ -548,17 +564,23 @@ static void writepages_finish(struct ceph_osd_request *req, { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; int rc = req->r_result; - u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -566,7 +588,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -575,8 +597,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -605,35 +627,18 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(osd_data->pages); ceph_osdc_put_request(req); } /* - * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. - */ -static void alloc_page_vec(struct ceph_fs_client *fsc, - struct ceph_osd_request *req) -{ - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, - GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); - } -} - -/* * initiate async writeback */ static int ceph_writepages_start(struct address_space *mapping, @@ -641,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping, { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -653,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size = 0; + u64 truncate_size, snap_size; + u32 truncate_seq; /* * Include a 'sync' in the OSD request if this is a data * integrity write (e.g., O_SYNC write or fsync()), or if our * cap is being revoked. */ - do_sync = wbc->sync_mode == WB_SYNC_ALL; - if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) do_sync = 1; dout("writepages_start %p dosync=%d (mode=%s)\n", inode, do_sync, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - fsc = ceph_inode_to_client(inode); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ @@ -699,6 +705,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); + snap_size = 0; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -706,8 +713,18 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } + if (snap_size == 0) + snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the * start of the original file range. */ @@ -718,10 +735,13 @@ retry: last_snapc = snapc; while (!done && index <= end) { + int num_ops = do_sync ? 2 : 1; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; @@ -773,11 +793,8 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if ((snap_size && page_offset(page) > snap_size) || - (!snap_size && - page_offset(page) > i_size_read(inode))) { - dout("%p page eof %llu\n", page, snap_size ? - snap_size : i_size_read(inode)); + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); done = 1; unlock_page(page); break; @@ -805,34 +822,42 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { + BUG_ON(pages); /* prepare async write request */ - offset = (u64) page_offset(page); + offset = (u64)page_offset(page); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true, 0); - + &ci->i_layout, vino, + offset, &len, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); if (IS_ERR(req)) { rc = PTR_ERR(req); unlock_page(page); break; } - max_pages = req->r_num_pages; - - alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } } /* note position of first page in pvec */ @@ -850,7 +875,7 @@ get_more_pages: } set_page_writeback(page); - req->r_pages[locked_pages] = page; + pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -879,18 +904,27 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ - req->r_num_pages = locked_pages; - req->r_request_ops[0].extent.length = cpu_to_le64(len); - req->r_request_ops[0].payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); @@ -1067,51 +1101,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *fi = file->private_data; struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int r, want, got = 0; - - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - - dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, len, inode->i_size); - r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); - if (r < 0) - return r; - dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { - ceph_put_cap_refs(ci, got); - return -EAGAIN; - } + int r; do { /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) { - r = -ENOMEM; - break; - } + if (!page) + return -ENOMEM; + *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); - if (r) - page_cache_release(page); } while (r == -EAGAIN); - if (r) { - ceph_put_cap_refs(ci, got); - } else { - *pagep = page; - *(int *)fsdata = got; - } return r; } @@ -1125,12 +1131,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; - int got = (unsigned long)fsdata; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1153,19 +1157,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); - if (copied > 0) { - int dirty; - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - if (dirty) - __mark_inode_dirty(inode, dirty); - } - - dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - ceph_put_cap_refs(ci, got); - if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 78e2f575247d..25442b40c25a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -490,15 +483,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear D_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) - ceph_dir_clear_complete(&ci->vfs_inode); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } } } @@ -553,6 +548,7 @@ retry: cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); @@ -609,9 +605,11 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) { + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ci->i_auth_cap = cap; + } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { @@ -628,7 +626,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (mseq > cap->mseq) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -689,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } @@ -796,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && __cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) + return 1; } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); @@ -997,9 +1013,9 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; @@ -1974,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, __ceph_caps_used(ci), __ceph_caps_wanted(ci), @@ -2046,6 +2069,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_unlock(&inode->i_mutex); + spin_lock(&ci->i_ceph_lock); + } + if (need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", @@ -2067,12 +2101,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } have = __ceph_caps_issued(ci, &implemented); - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; - if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting @@ -2466,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a @@ -3035,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6d797f46d772..a40ceda47a32 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -107,15 +107,14 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * D_COMPLETE tells indicates we have all dentries in the dir. It is + * Complete dir indicates that we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int __dcache_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = filp->private_data; - struct dentry *parent = filp->f_dentry; + struct ceph_file_info *fi = file->private_data; + struct dentry *parent = file->f_dentry; struct inode *dir = parent->d_inode; struct list_head *p; struct dentry *dentry, *last; @@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp, last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, + dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, last); spin_lock(&parent->d_lock); /* start at beginning? */ - if (filp->f_pos == 2 || last == NULL || - filp->f_pos < ceph_dentry(last)->offset) { + if (ctx->pos == 2 || last == NULL || + ctx->pos < ceph_dentry(last)->offset) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -157,11 +156,11 @@ more: if (!d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - filp->f_pos <= di->offset) + ctx->pos <= di->offset) break; dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, dentry->d_name.len, dentry->d_name.name, di->offset, - filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + ctx->pos, d_unhashed(dentry) ? " unhashed" : "", !dentry->d_inode ? " null" : ""); spin_unlock(&dentry->d_lock); p = p->prev; @@ -173,33 +172,31 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - filp->f_pos = di->offset; - err = filldir(dirent, dentry->d_name.name, - dentry->d_name.len, di->offset, + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12); - - if (last) { - if (err < 0) { + dentry->d_inode->i_mode >> 12)) { + if (last) { /* remember our position */ fi->dentry = last; fi->next_offset = di->offset; - } else { - dput(last); } + dput(dentry); + return 0; } - last = dentry; - if (err < 0) - goto out; + if (last) + dput(last); + last = dentry; - filp->f_pos++; + ctx->pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_test_complete(dir)) { - dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } -static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = file_inode(filp); + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(filp->f_pos); - int off = fpos_off(filp->f_pos); + unsigned frag = fpos_frag(ctx->pos); + int off = fpos_off(ctx->pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; const int max_entries = fsc->mount_options->max_readdir; const int max_bytes = fsc->mount_options->max_readdir_bytes; - dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); + dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ - if (filp->f_pos == 0) { + if (ctx->pos == 0) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - fi->dir_release_count = ci->i_release_count; + fi->dir_release_count = atomic_read(&ci->i_release_count); dout("readdir off 0 -> '.'\n"); - if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), + if (!dir_emit(ctx, ".", 1, ceph_translate_ino(inode->i_sb, inode->i_ino), - inode->i_mode >> 12) < 0) + inode->i_mode >> 12)) return 0; - filp->f_pos = 1; + ctx->pos = 1; off = 1; } - if (filp->f_pos == 1) { - ino_t ino = parent_ino(filp->f_dentry); + if (ctx->pos == 1) { + ino_t ino = parent_ino(file->f_dentry); dout("readdir off 1 -> '..'\n"); - if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), + if (!dir_emit(ctx, "..", 2, ceph_translate_ino(inode->i_sb, ino), - inode->i_mode >> 12) < 0) + inode->i_mode >> 12)) return 0; - filp->f_pos = 2; + ctx->pos = 2; off = 2; } /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((filp->f_pos == 2 || fi->dentry) && + if ((ctx->pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - ceph_dir_test_complete(inode) && + __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(filp, dirent, filldir); + err = __dcache_readdir(file, ctx); if (err != -EAGAIN) return err; } else { @@ -327,7 +324,7 @@ more: return PTR_ERR(req); req->r_inode = inode; ihold(inode); - req->r_dentry = dget(filp->f_dentry); + req->r_dentry = dget(file->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); @@ -350,7 +347,8 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude D_COMPLETE */ + /* preclude from marking dir complete */ + fi->dir_release_count--; } /* note next offset and last dentry name */ @@ -378,15 +376,16 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); + + ctx->pos = ceph_make_fpos(frag, off); while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; struct ceph_vino vino; ino_t ino; dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, pos, + off, off - fi->offset, rinfo->dir_nr, ctx->pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); @@ -394,16 +393,15 @@ more: vino.ino = le64_to_cpu(in->ino); vino.snap = le64_to_cpu(in->snapid); ino = ceph_vino_to_ino(vino); - if (filldir(dirent, + if (!dir_emit(ctx, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } off++; - filp->f_pos = pos + 1; + ctx->pos++; } if (fi->last_name) { @@ -416,7 +414,7 @@ more: if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); off = 0; - filp->f_pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, off); dout("readdir next frag is %x\n", frag); goto more; } @@ -428,13 +426,14 @@ more: * the complete dir contents in our cache. */ spin_lock(&ci->i_ceph_lock); - if (ci->i_release_count == fi->dir_release_count) { - ceph_dir_set_complete(inode); - ci->i_max_offset = filp->f_pos; + if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count); + ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); - dout("readdir %p filp %p done.\n", inode, filp); + dout("readdir %p file %p done.\n", inode, file); return 0; } @@ -604,7 +603,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - ceph_dir_test_complete(dir) && + __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -1065,44 +1064,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, } /* - * Set/clear/test dir complete flag on the dir's dentry. - */ -void ceph_dir_set_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry) && - ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -void ceph_dir_clear_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -bool ceph_dir_test_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) NOT complete\n", inode, dentry); - clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); - return false; -} - -/* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. * @@ -1110,15 +1071,13 @@ bool ceph_dir_test_complete(struct inode *inode) */ static void ceph_d_prune(struct dentry *dentry) { - struct ceph_dentry_info *di; - dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? */ if (IS_ROOT(dentry)) return; - /* if we are not hashed, we don't affect D_COMPLETE */ + /* if we are not hashed, we don't affect dir's completeness */ if (d_unhashed(dentry)) return; @@ -1126,8 +1085,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - di = ceph_dentry(dentry->d_parent); - clear_bit(CEPH_D_COMPLETE, &di->flags); + ceph_dir_clear_complete(dentry->d_parent->d_inode); } /* @@ -1307,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .readdir = ceph_readdir, + .iterate = ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bf338d9b67e3..2ddf061c1c4a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "super.h" #include "mds_client.h" @@ -446,19 +447,35 @@ done: } /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. */ -static void sync_write_commit(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - dout("sync_write_commit %p tid %llu\n", req, req->r_tid); - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? "un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } /* @@ -470,36 +487,33 @@ static void sync_write_commit(struct ceph_osd_request *req, * objects, rollback on failure, etc.) */ static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t *offset) + size_t left, loff_t pos, loff_t *ppos) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; struct ceph_osd_request *req; + int num_ops = 1; struct page **pages; int num_pages; - long long unsigned pos; u64 len; int written = 0; int flags; - int do_sync = 0; int check_caps = 0; int page_align, io_align; unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; + bool own_pages = false; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, *offset, + dout("sync_write on file %p %lld~%u %s\n", file, pos, (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_APPEND) - pos = i_size_read(inode); - else - pos = *offset; - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -516,7 +530,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) flags |= CEPH_OSD_FLAG_ACK; else - do_sync = 1; + num_ops++; /* Also include a 'startsync' command. */ /* * we may need to do multiple writes here if we span an object @@ -526,25 +540,20 @@ more: io_align = pos & ~PAGE_MASK; buf_align = (unsigned long)data & ~PAGE_MASK; len = left; - if (file->f_flags & O_DIRECT) { - /* write from beginning of first page, regardless of - io alignment */ - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - num_pages = calc_pages_for((unsigned long)data, len); - } else { - page_align = pos & ~PAGE_MASK; - num_pages = calc_pages_for(pos, len); - } + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - ceph_vino(inode), pos, &len, - CEPH_OSD_OP_WRITE, flags, - ci->i_snap_realm->cached_context, - do_sync, + vino, pos, &len, num_ops, + CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, page_align); + false); if (IS_ERR(req)) return PTR_ERR(req); + /* write from beginning of first page, regardless of io alignment */ + page_align = file->f_flags & O_DIRECT ? buf_align : io_align; + num_pages = calc_pages_for(page_align, len); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { @@ -572,36 +581,20 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ - req->r_safe_callback = sync_write_commit; - req->r_own_pages = 1; + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; + own_pages = true; } } - req->r_pages = pages; - req->r_num_pages = num_pages; - req->r_inode = inode; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, own_pages); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - if (req->r_safe_callback) { - /* - * Add to inode unsafe list only after we - * start_request so that a tid has been assigned. - */ - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - } - + if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret < 0 && req->r_safe_callback) { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } - } if (file->f_flags & O_DIRECT) ceph_put_page_vector(pages, num_pages, false); @@ -614,12 +607,12 @@ out: pos += len; written += len; left -= len; - data += written; + data += len; if (left) goto more; ret = written; - *offset = pos; + *ppos = pos; if (pos > i_size_read(inode)) check_caps = ceph_inode_set_size(inode, pos); if (check_caps) @@ -653,7 +646,6 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: - __ceph_do_pending_vmtruncate(inode); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else @@ -717,55 +709,74 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - loff_t endoff = pos + iov->iov_len; - int got = 0; - int ret, err, written; + ssize_t count, written = 0; + int err, want, got; + bool hold_mutex; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; -retry_snap: - written = 0; - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) - return -ENOSPC; - __ceph_do_pending_vmtruncate(inode); + mutex_lock(&inode->i_mutex); + hold_mutex = true; - /* - * try to do a buffered write. if we don't have sufficient - * caps, we'll get -EAGAIN from generic_file_aio_write, or a - * short write if we only get caps for some pages. - */ - if (!(iocb->ki_filp->f_flags & O_DIRECT) && - !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && - !(fi->flags & CEPH_F_SYNC)) { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (ret >= 0) - written = ret; - - if ((ret >= 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + written - 1, 1); - if (err < 0) - ret = err; - } - if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) - goto out; + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + goto out; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = file->f_mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + err = -ENOSPC; + goto out; } - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, inode->i_size); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); - if (ret < 0) + dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + got = 0; + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + if (err < 0) goto out; - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, ceph_cap_string(got)); - ret = ceph_sync_write(file, iov->iov_base + written, - iov->iov_len - written, &iocb->ki_pos); - if (ret >= 0) { + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { + mutex_unlock(&inode->i_mutex); + written = ceph_sync_write(file, iov->iov_base, count, + pos, &iocb->ki_pos); + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, &iocb->ki_pos, + count, 0); + mutex_unlock(&inode->i_mutex); + } + hold_mutex = false; + + if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); @@ -773,18 +784,33 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); } + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, ceph_cap_string(got)); + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); ceph_put_cap_refs(ci, got); -out: - if (ret == -EOLDSNAPC) { + + if (written >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + written - 1, 1); + if (err < 0) + written = err; + } + + if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); + mutex_lock(&inode->i_mutex); + hold_mutex = true; goto retry_snap; } +out: + if (hold_mutex) + mutex_unlock(&inode->i_mutex); + current->backing_dev_info = NULL; - return ret; + return written ? written : err; } /* @@ -838,16 +864,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) break; } - if (offset < 0 || offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 851814d951cd..f3a2abf28a77 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -302,7 +302,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_release_count = 0; + atomic_set(&ci->i_release_count, 1); + atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -561,7 +562,6 @@ static int fill_inode(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); int i; int issued = 0, implemented; - int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -601,7 +601,6 @@ static int fill_inode(struct inode *inode, (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -717,6 +716,17 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode), inode->i_mode); } + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); + ci->i_max_offset = 2; + } no_change: spin_unlock(&ci->i_ceph_lock); @@ -767,19 +777,6 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } - /* set dir completion flag? */ - if (S_ISDIR(inode->i_mode) && - updating_inode && /* didn't jump to no_change */ - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !ceph_dir_test_complete(inode)) { - dout(" marking %p complete (empty)\n", inode); - ceph_dir_set_complete(inode); - ci->i_max_offset = 2; - } - /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); @@ -861,7 +858,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); - if (!ceph_dir_test_complete(inode)) { + if (!__ceph_dir_is_complete(ci)) { spin_unlock(&ci->i_ceph_lock); return; } @@ -906,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " "inode %p ino %llx.%llx\n", - dn, dn->d_count, - realdn, realdn->d_count, + dn, d_count(dn), + realdn, d_count(realdn), realdn->d_inode, ceph_vinop(realdn->d_inode)); dput(dn); dn = realdn; @@ -1065,8 +1062,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when holding - * D_COMPLETE. + * directory offset so we can behave when dir is + * complete. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, @@ -1457,7 +1454,7 @@ out: /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq; * * We also truncate in a separate thread as well. */ @@ -1494,8 +1491,6 @@ void ceph_queue_vmtruncate(struct inode *inode) } /* - * called with i_mutex held. - * * Make sure any pending truncation is applied before doing anything * that may depend on it. */ @@ -1563,6 +1558,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, .follow_link = ceph_sym_follow_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, }; /* diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 4a989345b37b..e0b4ef31d3c8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -208,8 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, - osdc->osdmap); + + ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 202dd3d68be0..ae6d14e82b0f 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) } /** - * Must be called with BKL already held. Fills in the passed + * Must be called with lock_flocks() already held. Fills in the passed * counter variables, so you can prepare pagelist metadata before calling * ceph_encode_locks. */ @@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) } /** - * Encode the flock and fcntl locks for the given inode into the pagelist. - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, - * sequential flock locks. - * Must be called with lock_flocks() already held. - * If we encounter more of a specific lock type than expected, - * we return the value 1. + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. */ -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, - int num_fcntl_locks, int num_flock_locks) +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct ceph_filelock cephlock; int err = 0; int seen_fcntl = 0; int seen_flock = 0; + int l = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); - if (err) - goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { ++seen_fcntl; @@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } - - err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); - if (err) - goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { ++seen_flock; @@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } fail: return err; } +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + /* * Given a pointer to a lock, convert it to a ceph filelock */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 442880d099c9..187bf214444d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -265,7 +265,8 @@ static int parse_reply_info_extra(void **p, void *end, { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR) + else if (info->head->op == CEPH_MDS_OP_READDIR || + info->head->op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); else if (info->head->op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); @@ -364,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_auth.authorizer); kfree(s); } } @@ -1196,6 +1197,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); __ceph_remove_cap(cap); } else { /* try to drop referring dentries */ @@ -1388,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, num = le32_to_cpu(head->num); dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); session->s_num_cap_releases += num; /* requeue completed messages */ @@ -1550,7 +1554,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, dentry->d_count, *base, len, path); + dentry, d_count(dentry), *base, len, path); return path; } @@ -1718,8 +1722,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - msg->pages = req->r_pages; - msg->nr_pages = req->r_num_pages; + if (req->r_data_len) { + /* outbound data set only by ceph_sync_setxattr() */ + BUG_ON(!req->r_pages); + ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + } + msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -1913,6 +1921,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } @@ -2026,20 +2035,16 @@ out: } /* - * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *inode = req->r_locked_dir; - struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); - spin_lock(&ci->i_ceph_lock); - ceph_dir_clear_complete(inode); - ci->i_release_count++; - spin_unlock(&ci->i_ceph_lock); + dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + ceph_dir_clear_complete(inode); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -2450,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); @@ -2474,39 +2480,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - struct ceph_pagelist_cursor trunc_point; - - ceph_pagelist_set_cursor(pagelist, &trunc_point); - do { - lock_flocks(); - ceph_count_locks(inode, &num_fcntl_locks, - &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - unlock_flocks(); - - /* pre-alloc pagelist */ - ceph_pagelist_truncate(pagelist, &trunc_point); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_pagelist_reserve(pagelist, - rec.v2.flock_len); - - /* encode locks */ - if (!err) { - lock_flocks(); - err = ceph_encode_locks(inode, - pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_flocks(); - } - } while (err == -ENOSPC); + struct ceph_filelock *flocks; + +encode_again: + spin_lock(&inode->i_lock); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + spin_unlock(&inode->i_lock); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + spin_lock(&inode->i_lock); + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + spin_unlock(&inode->i_lock); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } - out_free: kfree(path); out_dput: @@ -2599,11 +2610,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - reply->pagelist = pagelist; if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); - reply->hdr.data_len = cpu_to_le32(pagelist->length); - reply->nr_pages = calc_pages_for(0, pagelist->length); + if (pagelist->length) { + /* set up outbound data if we have any */ + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); + } ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3029,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); @@ -3433,13 +3448,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &s->s_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); if (ret) return ERR_PTR(ret); } @@ -3455,7 +3474,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -3464,12 +3483,32 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3478,6 +3517,7 @@ static const struct ceph_connection_operations mds_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, }; /* eof */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 0d3c9240c61b..132b64eeecd4 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; int i; - char r; + + /* special case for one mds */ + if (1 == m->m_max_mds && m->m_info[0].state > 0) + return 0; /* count */ for (i = 0; i < m->m_max_mds; i++) @@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return -1; /* pick */ - get_random_bytes(&r, 1); - n = r % n; + n = prandom_u32() % n; i = 0; for (i = 0; n > 0; i++, n--) while (m->m_info[i].state <= 0) @@ -90,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; + struct ceph_mds_info *info; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -124,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds >= 0 && mds < m->m_max_mds && state > 0) { - m->m_info[mds].global_id = global_id; - m->m_info[mds].state = state; - m->m_info[mds].addr = addr; - m->m_info[mds].laggy = - (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); - m->m_info[mds].num_export_targets = num_export_targets; - if (num_export_targets) { - m->m_info[mds].export_targets = - kcalloc(num_export_targets, sizeof(u32), - GFP_NOFS); - for (j = 0; j < num_export_targets; j++) - m->m_info[mds].export_targets[j] = - ceph_decode_32(&pexport_targets); - } else { - m->m_info[mds].export_targets = NULL; - } + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; } } @@ -168,7 +174,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index cbb2f54a3019..f01645a27752 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; - snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; - atomic_set(&snapc->nref, 1); /* build (reverse sorted) snap vector */ num = 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6ddc0bca56b2..6627b26a800c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ - if (*dev_name_end != ':') { + if (dev_name_end < dev_name || *dev_name_end != ':') { pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; @@ -479,6 +479,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; const unsigned required_features = 0; + int page_count; + size_t size; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); @@ -522,8 +524,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, /* set up mempools */ err = -ENOMEM; - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + size = sizeof (struct page *) * (page_count ? page_count : 1); + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c7b309723dcc..cbded572345e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -204,7 +204,6 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { - unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -215,18 +214,6 @@ struct ceph_dentry_info { u64 offset; }; -/* - * dentry flags - * - * The locking for D_COMPLETE is a bit odd: - * - we can clear it at almost any time (see ceph_d_prune) - * - it is only meaningful if: - * - we hold dir inode i_ceph_lock - * - we hold dir FILE_SHARED caps - * - the dentry D_COMPLETE is set - */ -#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ - struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -257,7 +244,8 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - unsigned long i_release_count; + atomic_t i_release_count; + atomic_t i_complete_count; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -267,7 +255,7 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -436,33 +424,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + int release_count) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags &= ~mask; - spin_unlock(&ci->i_ceph_lock); + atomic_set(&ci->i_complete_count, release_count); } -static inline void ceph_i_set(struct inode *inode, unsigned mask) +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); + atomic_inc(&ci->i_release_count); +} - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags |= mask; - spin_unlock(&ci->i_ceph_lock); +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic_read(&ci->i_complete_count) == + atomic_read(&ci->i_release_count); } -static inline bool ceph_i_test(struct inode *inode, unsigned mask) +static inline void ceph_dir_clear_complete(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool r; + __ceph_dir_clear_complete(ceph_inode(inode)); +} - spin_lock(&ci->i_ceph_lock); - r = (ci->i_ceph_flags & mask) == mask; - spin_unlock(&ci->i_ceph_lock); - return r; +static inline bool ceph_dir_is_complete(struct inode *inode) +{ + return __ceph_dir_is_complete(ceph_inode(inode)); } @@ -489,13 +475,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* - * set/clear directory D_COMPLETE flag - */ -void ceph_dir_set_complete(struct inode *inode); -void ceph_dir_clear_complete(struct inode *inode); -bool ceph_dir_test_complete(struct inode *inode); - -/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -555,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); @@ -584,7 +563,7 @@ struct ceph_file_info { u64 next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ - unsigned long dir_release_count; + int dir_release_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -755,6 +734,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap) extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, + u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, @@ -841,8 +822,13 @@ extern const struct export_operations ceph_export_ops; extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, - int p_locks, int f_locks); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9b6b2b6dd164..be661d8f532a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - spin_lock(&ci->i_ceph_lock); - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { err = vxattr->getxattr_cb(ci, value, size); - goto out; + return err; } + spin_lock(&ci->i_ceph_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 2906ee276408..603f18a65c12 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -10,6 +10,7 @@ config CIFS select CRYPTO_ECB select CRYPTO_DES select CRYPTO_SHA256 + select CRYPTO_CMAC help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 1d36db114772..a3b56544c21b 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -506,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length, /* GSSAPI header */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding negTokenInit header"); + cifs_dbg(FYI, "Error decoding negTokenInit header\n"); return 0; } else if ((cls != ASN1_APL) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag); + cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag); return 0; } @@ -531,52 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length, /* SPNEGO OID not present or garbled -- bail out */ if (!rc) { - cFYI(1, "Error decoding negTokenInit header"); + cifs_dbg(FYI, "Error decoding negTokenInit header\n"); return 0; } /* SPNEGO */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding negTokenInit"); + cifs_dbg(FYI, "Error decoding negTokenInit\n"); return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", + cls, con, tag, end, *end); return 0; } /* negTokenInit */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding negTokenInit"); + cifs_dbg(FYI, "Error decoding negTokenInit\n"); return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", + cls, con, tag, end, *end); return 0; } /* sequence */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding 2nd part of negTokenInit"); + cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", + cls, con, tag, end, *end); return 0; } /* sequence of */ if (asn1_header_decode (&ctx, &sequence_end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding 2nd part of negTokenInit"); + cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", + cls, con, tag, end, *end); return 0; } @@ -584,15 +584,15 @@ decode_negTokenInit(unsigned char *security_blob, int length, while (!asn1_eoc_decode(&ctx, sequence_end)) { rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); if (!rc) { - cFYI(1, "Error decoding negTokenInit hdr exit2"); + cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n"); return 0; } if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { - cFYI(1, "OID len = %d oid = 0x%lx 0x%lx " - "0x%lx 0x%lx", oidlen, *oid, - *(oid + 1), *(oid + 2), *(oid + 3)); + cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n", + oidlen, *oid, *(oid + 1), *(oid + 2), + *(oid + 3)); if (compare_oid(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) @@ -610,7 +610,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, kfree(oid); } } else { - cFYI(1, "Should be an oid what is going on?"); + cifs_dbg(FYI, "Should be an oid what is going on?\n"); } } diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 282d6de7e410..6c665bf4a27c 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, break; default: - cERROR(1, "Unknown network family '%d'", sa->sa_family); + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); key_len = 0; break; } @@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { - cFYI(1, "%s: couldn't extract sharename", __func__); + cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); sharename = NULL; return 0; } @@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) pagevec_init(&pvec, 0); first = 0; - cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); + cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); for (;;) { nr_pages = pagevec_lookup(&pvec, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index d9ea6ede6a7a..f3ac4154cbb6 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -57,15 +57,32 @@ cifs_dump_mem(char *label, void *data, int length) } } +#ifdef CONFIG_CIFS_DEBUG +void cifs_vfs_err(const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "CIFS VFS: %pV", &vaf); + + va_end(args); +} +#endif + void cifs_dump_detail(void *buf) { #ifdef CONFIG_CIFS_DEBUG2 struct smb_hdr *smb = (struct smb_hdr *)buf; - cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", - smb->Command, smb->Status.CifsError, - smb->Flags, smb->Flags2, smb->Mid, smb->Pid); - cERROR(1, "smb buf %p len %u", smb, smbCalcSize(smb)); + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", + smb->Command, smb->Status.CifsError, + smb->Flags, smb->Flags2, smb->Mid, smb->Pid); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb)); #endif /* CONFIG_CIFS_DEBUG2 */ } @@ -78,25 +95,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server) if (server == NULL) return; - cERROR(1, "Dump pending requests:"); + cifs_dbg(VFS, "Dump pending requests:\n"); spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu", - mid_entry->mid_state, - le16_to_cpu(mid_entry->command), - mid_entry->pid, - mid_entry->callback_data, - mid_entry->mid); + cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); #ifdef CONFIG_CIFS_STATS2 - cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", - mid_entry->large_buf, - mid_entry->resp_buf, - mid_entry->when_received, - jiffies); + cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n", + mid_entry->large_buf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies); #endif /* STATS2 */ - cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp, - mid_entry->multiEnd); + cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n", + mid_entry->multiRsp, mid_entry->multiEnd); if (mid_entry->resp_buf) { cifs_dump_detail(mid_entry->resp_buf); cifs_dump_mem("existing buf: ", @@ -196,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) tcon->nativeFileSystem); } seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" - "\nPathComponentMax: %d Status: 0x%d", + "\n\tPathComponentMax: %d Status: 0x%d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), @@ -207,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); + if (server->ops->dump_share_caps) + server->ops->dump_share_caps(m, tcon); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); @@ -578,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) return single_open(file, cifs_security_flags_proc_show, NULL); } +/* + * Ensure that if someone sets a MUST flag, that we disable all other MAY + * flags except for the ones corresponding to the given MUST flag. If there are + * multiple MUST flags, then try to prefer more secure ones. + */ +static void +cifs_security_flags_handle_must_flags(unsigned int *flags) +{ + unsigned int signflags = *flags & CIFSSEC_MUST_SIGN; + + if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) + *flags = CIFSSEC_MUST_KRB5; + else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + *flags = CIFSSEC_MUST_NTLMSSP; + else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) + *flags = CIFSSEC_MUST_NTLMV2; + else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) + *flags = CIFSSEC_MUST_NTLM; + else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) + *flags = CIFSSEC_MUST_LANMAN; + else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) + *flags = CIFSSEC_MUST_PLNTXT; + + *flags |= signflags; +} + static ssize_t cifs_security_flags_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { + int rc; unsigned int flags; char flags_string[12]; char c; @@ -603,34 +649,43 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, global_secflags = CIFSSEC_MAX; return count; } else if (!isdigit(c)) { - cERROR(1, "invalid flag %c", c); + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", + flags_string); return -EINVAL; } } - /* else we have a number */ - flags = simple_strtoul(flags_string, NULL, 0); + /* else we have a number */ + rc = kstrtouint(flags_string, 0, &flags); + if (rc) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", + flags_string); + return rc; + } - cFYI(1, "sec flags 0x%x", flags); + cifs_dbg(FYI, "sec flags 0x%x\n", flags); - if (flags <= 0) { - cERROR(1, "invalid security flags %s", flags_string); + if (flags == 0) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return -EINVAL; } if (flags & ~CIFSSEC_MASK) { - cERROR(1, "attempt to set unsupported security flags 0x%x", - flags & ~CIFSSEC_MASK); + cifs_dbg(VFS, "Unsupported security flags: 0x%x\n", + flags & ~CIFSSEC_MASK); return -EINVAL; } + + cifs_security_flags_handle_must_flags(&flags); + /* flags look ok - update the global security flags for cifs module */ global_secflags = flags; if (global_secflags & CIFSSEC_MUST_SIGN) { /* requiring signing implies signing is allowed */ global_secflags |= CIFSSEC_MAY_SIGN; - cFYI(1, "packet signing now required"); + cifs_dbg(FYI, "packet signing now required\n"); } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) { - cFYI(1, "packet signing disabled"); + cifs_dbg(FYI, "packet signing disabled\n"); } /* BB should we turn on MAY flags for other MUST options? */ return count; diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 69ae3d3c3b31..c99b40fb609b 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -25,18 +25,20 @@ void cifs_dump_mem(char *label, void *data, int length); void cifs_dump_detail(void *); void cifs_dump_mids(struct TCP_Server_Info *); -#ifdef CONFIG_CIFS_DEBUG2 -#define DBG2 2 -#else -#define DBG2 0 -#endif extern int traceSMB; /* flag which enables the function below */ void dump_smb(void *, int); #define CIFS_INFO 0x01 #define CIFS_RC 0x02 #define CIFS_TIMER 0x04 +#define VFS 1 +#define FYI 2 extern int cifsFYI; +#ifdef CONFIG_CIFS_DEBUG2 +#define NOISY 4 +#else +#define NOISY 0 +#endif /* * debug ON @@ -44,31 +46,21 @@ extern int cifsFYI; */ #ifdef CONFIG_CIFS_DEBUG -/* information message: e.g., configuration, major event */ -#define cifsfyi(fmt, ...) \ -do { \ - if (cifsFYI & CIFS_INFO) \ - printk(KERN_DEBUG "%s: " fmt "\n", \ - __FILE__, ##__VA_ARGS__); \ -} while (0) - -#define cFYI(set, fmt, ...) \ -do { \ - if (set) \ - cifsfyi(fmt, ##__VA_ARGS__); \ -} while (0) +__printf(1, 2) void cifs_vfs_err(const char *fmt, ...); -#define cifswarn(fmt, ...) \ - printk(KERN_WARNING fmt "\n", ##__VA_ARGS__) - -/* error event message: e.g., i/o error */ -#define cifserror(fmt, ...) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ - -#define cERROR(set, fmt, ...) \ +/* information message: e.g., configuration, major event */ +#define cifs_dbg(type, fmt, ...) \ do { \ - if (set) \ - cifserror(fmt, ##__VA_ARGS__); \ + if (type == FYI) { \ + if (cifsFYI & CIFS_INFO) { \ + printk(KERN_DEBUG "%s: " fmt, \ + __FILE__, ##__VA_ARGS__); \ + } \ + } else if (type == VFS) { \ + cifs_vfs_err(fmt, ##__VA_ARGS__); \ + } else if (type == NOISY && type != 0) { \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + } \ } while (0) /* @@ -76,27 +68,11 @@ do { \ * --------- */ #else /* _CIFS_DEBUG */ -#define cifsfyi(fmt, ...) \ +#define cifs_dbg(type, fmt, ...) \ do { \ if (0) \ - printk(KERN_DEBUG "%s: " fmt "\n", \ - __FILE__, ##__VA_ARGS__); \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) -#define cFYI(set, fmt, ...) \ -do { \ - if (0 && set) \ - cifsfyi(fmt, ##__VA_ARGS__); \ -} while (0) -#define cifserror(fmt, ...) \ -do { \ - if (0) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ -} while (0) -#define cERROR(set, fmt, ...) \ -do { \ - if (0 && set) \ - cifserror(fmt, ##__VA_ARGS__); \ -} while (0) -#endif /* _CIFS_DEBUG */ +#endif #endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 210fce2df308..58df174deb10 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/vfs.h> #include <linux/fs.h> +#include <linux/inet.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifsfs.h" @@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void) } /** - * cifs_get_share_name - extracts share name from UNC - * @node_name: pointer to UNC string + * cifs_build_devname - build a devicename from a UNC and optional prepath + * @nodename: pointer to UNC string + * @prepath: pointer to prefixpath (or NULL if there isn't one) * - * Extracts sharename form full UNC. - * i.e. strips from UNC trailing path that is not part of share - * name and fixup missing '\' in the beginning of DFS node refferal - * if necessary. - * Returns pointer to share name on success or ERR_PTR on error. - * Caller is responsible for freeing returned string. + * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer + * big enough to hold the final thing. Copy the UNC from the nodename, and + * concatenate the prepath onto the end of it if there is one. + * + * Returns pointer to the built string, or a ERR_PTR. Caller is responsible + * for freeing the returned string. */ -static char *cifs_get_share_name(const char *node_name) +static char * +cifs_build_devname(char *nodename, const char *prepath) { - int len; - char *UNC; - char *pSep; - - len = strlen(node_name); - UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, - GFP_KERNEL); - if (!UNC) - return ERR_PTR(-ENOMEM); + size_t pplen; + size_t unclen; + char *dev; + char *pos; + + /* skip over any preceding delimiters */ + nodename += strspn(nodename, "\\"); + if (!*nodename) + return ERR_PTR(-EINVAL); - /* get share name and server name */ - if (node_name[1] != '\\') { - UNC[0] = '\\'; - strncpy(UNC+1, node_name, len); - len++; - UNC[len] = 0; - } else { - strncpy(UNC, node_name, len); - UNC[len] = 0; - } + /* get length of UNC and set pos to last char */ + unclen = strlen(nodename); + pos = nodename + unclen - 1; - /* find server name end */ - pSep = memchr(UNC+2, '\\', len-2); - if (!pSep) { - cERROR(1, "%s: no server name end in node name: %s", - __func__, node_name); - kfree(UNC); - return ERR_PTR(-EINVAL); + /* trim off any trailing delimiters */ + while (*pos == '\\') { + --pos; + --unclen; } - /* find sharename end */ - pSep++; - pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); - if (pSep) { - /* trim path up to sharename end - * now we have share name in UNC */ - *pSep = 0; + /* allocate a buffer: + * +2 for preceding "//" + * +1 for delimiter between UNC and prepath + * +1 for trailing NULL + */ + pplen = prepath ? strlen(prepath) : 0; + dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + pos = dev; + /* add the initial "//" */ + *pos = '/'; + ++pos; + *pos = '/'; + ++pos; + + /* copy in the UNC portion from referral */ + memcpy(pos, nodename, unclen); + pos += unclen; + + /* copy the prefixpath remainder (if there is one) */ + if (pplen) { + *pos = '/'; + ++pos; + memcpy(pos, prepath, pplen); + pos += pplen; } - return UNC; + /* NULL terminator */ + *pos = '\0'; + + convert_delimiter(dev, '/'); + return dev; } @@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, { int rc; char *mountdata = NULL; + const char *prepath = NULL; int md_len; char *tkn_e; char *srvIP = NULL; @@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - *devname = cifs_get_share_name(ref->node_name); + if (strlen(fullpath) - ref->path_consumed) + prepath = fullpath + ref->path_consumed; + + *devname = cifs_build_devname(ref->node_name, prepath); if (IS_ERR(*devname)) { rc = PTR_ERR(*devname); *devname = NULL; @@ -141,17 +162,19 @@ char *cifs_compose_mount_options(const char *sb_mountdata, rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc < 0) { - cFYI(1, "%s: Failed to resolve server part of %s to IP: %d", - __func__, *devname, rc); + cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n", + __func__, *devname, rc); goto compose_mount_options_err; } - /* md_len = strlen(...) + 12 for 'sep+prefixpath=' - * assuming that we have 'unc=' and 'ip=' in - * the original sb_mountdata + /* + * In most cases, we'll be building a shorter string than the original, + * but we do have to assume that the address in the ip= option may be + * much longer than the original. Add the max length of an address + * string to the length of the original string to allow for worst case. */ - md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; - mountdata = kzalloc(md_len+1, GFP_KERNEL); + md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; + mountdata = kzalloc(md_len + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; @@ -195,29 +218,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strncat(mountdata, &sep, 1); strcat(mountdata, "ip="); strcat(mountdata, srvIP); - strncat(mountdata, &sep, 1); - strcat(mountdata, "unc="); - strcat(mountdata, *devname); - - /* find & copy prefixpath */ - tkn_e = strchr(ref->node_name + 2, '\\'); - if (tkn_e == NULL) { - /* invalid unc, missing share name*/ - rc = -EINVAL; - goto compose_mount_options_err; - } - - tkn_e = strchr(tkn_e + 1, '\\'); - if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { - strncat(mountdata, &sep, 1); - strcat(mountdata, "prefixpath="); - if (tkn_e) - strcat(mountdata, tkn_e + 1); - strcat(mountdata, fullpath + ref->path_consumed); - } - /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/ - /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/ + /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ + /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ compose_mount_options_out: kfree(srvIP); @@ -260,11 +263,12 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, static void dump_referral(const struct dfs_info3_param *ref) { - cFYI(1, "DFS: ref path: %s", ref->path_name); - cFYI(1, "DFS: node path: %s", ref->node_name); - cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type); - cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, - ref->path_consumed); + cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); + cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); + cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + ref->flags, ref->server_type); + cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + ref->ref_flag, ref->path_consumed); } /* @@ -283,7 +287,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) struct vfsmount *mnt; struct tcon_link *tlink; - cFYI(1, "in %s", __func__); + cifs_dbg(FYI, "in %s\n", __func__); BUG_ON(IS_ROOT(mntpt)); /* @@ -320,15 +324,15 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) /* connect to a node */ len = strlen(referrals[i].node_name); if (len < 2) { - cERROR(1, "%s: Net Address path too short: %s", - __func__, referrals[i].node_name); + cifs_dbg(VFS, "%s: Net Address path too short: %s\n", + __func__, referrals[i].node_name); mnt = ERR_PTR(-EINVAL); break; } mnt = cifs_dfs_do_refmount(cifs_sb, full_path, referrals + i); - cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, - referrals[i].node_name, mnt); + cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", + __func__, referrals[i].node_name, mnt); if (!IS_ERR(mnt)) goto success; } @@ -343,7 +347,7 @@ success: free_full_path: kfree(full_path); cdda_exit: - cFYI(1, "leaving %s" , __func__); + cifs_dbg(FYI, "leaving %s\n" , __func__); return mnt; } @@ -354,11 +358,11 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path) { struct vfsmount *newmnt; - cFYI(1, "in %s", __func__); + cifs_dbg(FYI, "in %s\n", __func__); newmnt = cifs_dfs_do_automount(path->dentry); if (IS_ERR(newmnt)) { - cFYI(1, "leaving %s [automount failed]" , __func__); + cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__); return newmnt; } @@ -366,7 +370,7 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path) mnt_set_expiry(newmnt, &cifs_dfs_automount_list); schedule_delayed_work(&cifs_dfs_automount_task, cifs_dfs_mountpoint_expiry_timeout); - cFYI(1, "leaving %s [ok]" , __func__); + cifs_dbg(FYI, "leaving %s [ok]\n" , __func__); return newmnt; } diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 10e774761299..a3e932547617 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -37,12 +37,11 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) int ret; ret = -ENOMEM; - payload = kmalloc(prep->datalen, GFP_KERNEL); + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) goto error; /* attach the data */ - memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; ret = 0; @@ -164,7 +163,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) dp = description + strlen(description); sprintf(dp, ";pid=0x%x", current->pid); - cFYI(1, "key description = %s", description); + cifs_dbg(FYI, "key description = %s\n", description); spnego_key = request_key(&cifs_spnego_key_type, description, ""); #ifdef CONFIG_CIFS_DEBUG2 diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 71d5d0a5f6b2..0227b45ef00a 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -227,8 +227,8 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len, for (i = 0; len && *from; i++, from += charlen, len -= charlen) { charlen = codepage->char2uni(from, len, &wchar_to); if (charlen < 1) { - cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d", - *from, charlen); + cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n", + *from, charlen); /* A question mark */ wchar_to = 0x003f; charlen = 1; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 4fb097468e21..fe8d6276410a 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -327,14 +327,14 @@ UniToupper(register wchar_t uc) /* * UniStrupr: Upper case a unicode string */ -static inline wchar_t * -UniStrupr(register wchar_t *upin) +static inline __le16 * +UniStrupr(register __le16 *upin) { - register wchar_t *up; + register __le16 *up; up = upin; while (*up) { /* For all characters */ - *up = UniToupper(*up); + *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); up++; } return upin; /* Return input pointer */ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index f1e3f25fe004..51f5e0ee7237 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -63,11 +63,10 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) key->datalen = prep->datalen; return 0; } - payload = kmalloc(prep->datalen, GFP_KERNEL); + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; - memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; key->datalen = prep->datalen; return 0; @@ -219,13 +218,13 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; - cFYI(1, "%s: Can't map %cid %u to a SID", __func__, - sidtype == SIDOWNER ? 'u' : 'g', cid); + cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n", + __func__, sidtype == SIDOWNER ? 'u' : 'g', cid); goto out_revert_creds; } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key " - "(datalen=%hu)", __func__, sidkey->datalen); + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", + __func__, sidkey->datalen); goto invalidate_key; } @@ -241,8 +240,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, " - "ksid_size=%u)", __func__, sidkey->datalen, ksid_size); + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n", + __func__, sidkey->datalen, ksid_size); goto invalidate_key; } @@ -274,8 +273,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, * Just return an error. */ if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { - cFYI(1, "%s: %u subauthorities is too many!", __func__, - psid->num_subauth); + cifs_dbg(FYI, "%s: %u subauthorities is too many!\n", + __func__, psid->num_subauth); return -EIO; } @@ -287,8 +286,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; - cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr, - sidtype == SIDOWNER ? 'u' : 'g'); + cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", + __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g'); goto out_revert_creds; } @@ -300,8 +299,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key " - "(datalen=%hu)", __func__, sidkey->datalen); + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", + __func__, sidkey->datalen); key_invalidate(sidkey); goto out_key_put; } @@ -346,7 +345,8 @@ init_cifs_idmap(void) struct key *keyring; int ret; - cFYI(1, "Registering the %s key type", cifs_idmap_key_type.name); + cifs_dbg(FYI, "Registering the %s key type\n", + cifs_idmap_key_type.name); /* create an override credential set with a special thread keyring in * which requests are cached @@ -379,7 +379,7 @@ init_cifs_idmap(void) cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; root_cred = cred; - cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); + cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring)); return 0; failed_put_key: @@ -395,7 +395,7 @@ exit_cifs_idmap(void) key_revoke(root_cred->thread_keyring); unregister_key_type(&cifs_idmap_key_type); put_cred(root_cred); - cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); + cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name); } /* copy ntsd, owner sid, and group sid from a security descriptor to another */ @@ -462,14 +462,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, *pbits_to_set &= ~S_IXUGO; return; } else if (type != ACCESS_ALLOWED) { - cERROR(1, "unknown access control type %d", type); + cifs_dbg(VFS, "unknown access control type %d\n", type); return; } /* else ACCESS_ALLOWED type */ if (flags & GENERIC_ALL) { *pmode |= (S_IRWXUGO & (*pbits_to_set)); - cFYI(DBG2, "all perms"); + cifs_dbg(NOISY, "all perms\n"); return; } if ((flags & GENERIC_WRITE) || @@ -482,7 +482,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) *pmode |= (S_IXUGO & (*pbits_to_set)); - cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode); + cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode); return; } @@ -511,7 +511,8 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, if (mode & S_IXUGO) *pace_flags |= SET_FILE_EXEC_RIGHTS; - cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags); + cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n", + mode, *pace_flags); return; } @@ -551,24 +552,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl) /* validate that we do not go past end of acl */ if (le16_to_cpu(pace->size) < 16) { - cERROR(1, "ACE too small %d", le16_to_cpu(pace->size)); + cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size)); return; } if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { - cERROR(1, "ACL too small to parse ACE"); + cifs_dbg(VFS, "ACL too small to parse ACE\n"); return; } num_subauth = pace->sid.num_subauth; if (num_subauth) { int i; - cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d", - pace->sid.revision, pace->sid.num_subauth, pace->type, - pace->flags, le16_to_cpu(pace->size)); + cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n", + pace->sid.revision, pace->sid.num_subauth, pace->type, + pace->flags, le16_to_cpu(pace->size)); for (i = 0; i < num_subauth; ++i) { - cFYI(1, "ACE sub_auth[%d]: 0x%x", i, - le32_to_cpu(pace->sid.sub_auth[i])); + cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n", + i, le32_to_cpu(pace->sid.sub_auth[i])); } /* BB add length check to make sure that we do not have huge @@ -601,13 +602,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, /* validate that we do not go past end of acl */ if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { - cERROR(1, "ACL too small to parse DACL"); + cifs_dbg(VFS, "ACL too small to parse DACL\n"); return; } - cFYI(DBG2, "DACL revision %d size %d num aces %d", - le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), - le32_to_cpu(pdacl->num_aces)); + cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n", + le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), + le32_to_cpu(pdacl->num_aces)); /* reset rwx permissions for user/group/other. Also, if num_aces is 0 i.e. DACL has no ACEs, @@ -627,10 +628,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, return; ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); - if (!ppace) { - cERROR(1, "DACL memory allocation error"); + if (!ppace) return; - } for (i = 0; i < num_aces; ++i) { ppace[i] = (struct cifs_ace *) (acl_base + acl_size); @@ -703,25 +702,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) /* validate that we do not go past end of ACL - sid must be at least 8 bytes long (assuming no sub-auths - e.g. the null SID */ if (end_of_acl < (char *)psid + 8) { - cERROR(1, "ACL too small to parse SID %p", psid); + cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid); return -EINVAL; } #ifdef CONFIG_CIFS_DEBUG2 if (psid->num_subauth) { int i; - cFYI(1, "SID revision %d num_auth %d", - psid->revision, psid->num_subauth); + cifs_dbg(FYI, "SID revision %d num_auth %d\n", + psid->revision, psid->num_subauth); for (i = 0; i < psid->num_subauth; i++) { - cFYI(1, "SID sub_auth[%d]: 0x%x ", i, - le32_to_cpu(psid->sub_auth[i])); + cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n", + i, le32_to_cpu(psid->sub_auth[i])); } /* BB add length check to make sure that we do not have huge num auths and therefore go off the end */ - cFYI(1, "RID 0x%x", - le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); + cifs_dbg(FYI, "RID 0x%x\n", + le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); } #endif @@ -748,31 +747,33 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, le32_to_cpu(pntsd->gsidoffset)); dacloffset = le32_to_cpu(pntsd->dacloffset); dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x " - "sacloffset 0x%x dacloffset 0x%x", + cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), le32_to_cpu(pntsd->gsidoffset), le32_to_cpu(pntsd->sacloffset), dacloffset); /* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ rc = parse_sid(owner_sid_ptr, end_of_acl); if (rc) { - cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc); + cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc); return rc; } rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); if (rc) { - cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc); + cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n", + __func__, rc); return rc; } rc = parse_sid(group_sid_ptr, end_of_acl); if (rc) { - cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc); + cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n", + __func__, rc); return rc; } rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); if (rc) { - cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc); + cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n", + __func__, rc); return rc; } @@ -780,7 +781,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); else - cFYI(1, "no ACL"); /* BB grant all or default perms? */ + cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */ return rc; } @@ -830,8 +831,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, id = from_kuid(&init_user_ns, uid); rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { - cFYI(1, "%s: Mapping error %d for owner id %d", - __func__, rc, id); + cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", + __func__, rc, id); kfree(nowner_sid_ptr); return rc; } @@ -850,8 +851,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, id = from_kgid(&init_user_ns, gid); rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { - cFYI(1, "%s: Mapping error %d for group id %d", - __func__, rc, id); + cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", + __func__, rc, id); kfree(ngroup_sid_ptr); return rc; } @@ -881,7 +882,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); - cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); if (rc) return ERR_PTR(rc); return pntsd; @@ -918,7 +919,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); free_xid(xid); - cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); if (rc) return ERR_PTR(rc); return pntsd; @@ -972,12 +973,12 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, create_options, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { - cERROR(1, "Unable to open file to set ACL"); + cifs_dbg(VFS, "Unable to open file to set ACL\n"); goto out; } rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); - cFYI(DBG2, "SetCIFSACL rc = %d", rc); + cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); CIFSSMBClose(xid, tcon, fid); out: @@ -995,7 +996,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, u32 acllen = 0; int rc = 0; - cFYI(DBG2, "converting ACL to mode for %s", path); + cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); if (pfid) pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); @@ -1005,12 +1006,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); - cERROR(1, "%s: error %d getting sec desc", __func__, rc); + cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); } else { rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); kfree(pntsd); if (rc) - cERROR(1, "parse sec desc failed rc = %d", rc); + cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); } return rc; @@ -1027,13 +1028,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ - cFYI(DBG2, "set ACL from mode for %s", path); + cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); - cERROR(1, "%s: error %d getting sec desc", __func__, rc); + cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); goto out; } @@ -1046,7 +1047,6 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { - cERROR(1, "Unable to allocate security descriptor"); kfree(pntsd); return -ENOMEM; } @@ -1054,12 +1054,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, &aclflag); - cFYI(DBG2, "build_sec_desc rc: %d", rc); + cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); if (!rc) { /* Set the security descriptor */ rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); - cFYI(DBG2, "set_cifs_acl rc: %d", rc); + cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } kfree(pnntsd); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 652f5051be09..3d8bf941d126 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -50,20 +50,20 @@ static int cifs_calc_signature(struct smb_rqst *rqst, return -EINVAL; if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature", __func__); + cifs_dbg(VFS, "%s: Can't generate signature\n", __func__); return -1; } rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Could not init md5", __func__); + cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); if (rc) { - cERROR(1, "%s: Could not update with response", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } @@ -71,7 +71,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { - cERROR(1, "null iovec entry"); + cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } /* The first entry includes a length field (which does not get @@ -88,8 +88,8 @@ static int cifs_calc_signature(struct smb_rqst *rqst, iov[i].iov_base, iov[i].iov_len); } if (rc) { - cERROR(1, "%s: Could not update with payload", - __func__); + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); return rc; } } @@ -106,7 +106,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst, rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -135,8 +135,8 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; + *pexpected_response_sequence_number = ++server->sequence_number; + ++server->sequence_number; rc = cifs_calc_signature(rqst, server, smb_signature); if (rc) @@ -196,8 +196,8 @@ int cifs_verify_signature(struct smb_rqst *rqst, /* Do not need to verify session setups with signature "BSRSPYL " */ if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) - cFYI(1, "dummy signature received for smb command 0x%x", - cifs_pdu->Command); + cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", + cifs_pdu->Command); /* save off the origiginal signature so we can modify the smb and check its signature against what the server sent */ @@ -235,30 +235,30 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) return -EINVAL; ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); - if (!ses->auth_key.response) { - cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len); + if (!ses->auth_key.response) return -ENOMEM; - } + ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { - cFYI(1, "%s Can't generate NTLM response, error: %d", - __func__, rc); + cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n", + __func__, rc); return rc; } rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { - cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", + __func__, rc); return rc; } rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); if (rc) - cFYI(1, "%s Can't generate NTLM session key, error: %d", - __func__, rc); + cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n", + __func__, rc); return rc; } @@ -276,7 +276,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { - memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); return 0; @@ -334,7 +333,6 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); if (!ses->auth_key.response) { ses->auth_key.len = 0; - cERROR(1, "Challenge target info allocation failure"); return -ENOMEM; } @@ -415,12 +413,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, int rc = 0; int len; char nt_hash[CIFS_NTHASH_SIZE]; - wchar_t *user; + __le16 *user; wchar_t *domain; wchar_t *server; if (!ses->server->secmech.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash"); + cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } @@ -430,27 +428,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); if (rc) { - cERROR(1, "%s: Could not set NT Hash as a key", __func__); + cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); return rc; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5"); + cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); return rc; } - /* convert ses->user_name to unicode and uppercase */ + /* convert ses->user_name to unicode */ len = ses->user_name ? strlen(ses->user_name) : 0; user = kmalloc(2 + (len * 2), GFP_KERNEL); if (user == NULL) { - cERROR(1, "calc_ntlmv2_hash: user mem alloc failure"); rc = -ENOMEM; return rc; } if (len) { - len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); + len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); UniStrupr(user); } else { memset(user, '\0', 2); @@ -460,7 +457,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, (char *)user, 2 * len); kfree(user); if (rc) { - cERROR(1, "%s: Could not update with user", __func__); + cifs_dbg(VFS, "%s: Could not update with user\n", __func__); return rc; } @@ -470,7 +467,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, domain = kmalloc(2 + (len * 2), GFP_KERNEL); if (domain == NULL) { - cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); rc = -ENOMEM; return rc; } @@ -481,8 +477,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, (char *)domain, 2 * len); kfree(domain); if (rc) { - cERROR(1, "%s: Could not update with domain", - __func__); + cifs_dbg(VFS, "%s: Could not update with domain\n", + __func__); return rc; } } else if (ses->serverName) { @@ -490,7 +486,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, server = kmalloc(2 + (len * 2), GFP_KERNEL); if (server == NULL) { - cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); rc = -ENOMEM; return rc; } @@ -501,8 +496,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, (char *)server, 2 * len); kfree(server); if (rc) { - cERROR(1, "%s: Could not update with server", - __func__); + cifs_dbg(VFS, "%s: Could not update with server\n", + __func__); return rc; } } @@ -510,7 +505,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ntlmv2_hash); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -522,24 +517,25 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) unsigned int offset = CIFS_SESS_KEY_SIZE + 8; if (!ses->server->secmech.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash"); + cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", + __func__); return rc; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); return rc; } - if (ses->server->secType == RawNTLMSSP) + if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) memcpy(ses->auth_key.response + offset, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); else @@ -548,14 +544,14 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + offset, ses->auth_key.len - offset); if (rc) { - cERROR(1, "%s: Could not update with response", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -571,18 +567,19 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) char ntlmv2_hash[16]; unsigned char *tiblob = NULL; /* target info blob */ - if (ses->server->secType == RawNTLMSSP) { + if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { rc = find_domain_name(ses, nls_cp); if (rc) { - cERROR(1, "error %d finding domain name", rc); + cifs_dbg(VFS, "error %d finding domain name\n", + rc); goto setup_ntlmv2_rsp_ret; } } } else { rc = build_avpair_blob(ses, nls_cp); if (rc) { - cERROR(1, "error %d building av pair blob", rc); + cifs_dbg(VFS, "error %d building av pair blob\n", rc); goto setup_ntlmv2_rsp_ret; } } @@ -595,7 +592,6 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (!ses->auth_key.response) { rc = ENOMEM; ses->auth_key.len = 0; - cERROR(1, "%s: Can't allocate auth blob", __func__); goto setup_ntlmv2_rsp_ret; } ses->auth_key.len += baselen; @@ -613,14 +609,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) /* calculate ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { - cERROR(1, "could not get v2 hash rc %d", rc); + cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); goto setup_ntlmv2_rsp_ret; } /* calculate first part of the client response (CR1) */ rc = CalcNTLMv2_response(ses, ntlmv2_hash); if (rc) { - cERROR(1, "Could not calculate CR1 rc: %d", rc); + cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); goto setup_ntlmv2_rsp_ret; } @@ -628,13 +624,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", + __func__); goto setup_ntlmv2_rsp_ret; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cERROR(1, "%s: Could not init hmacmd5", __func__); + cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); goto setup_ntlmv2_rsp_ret; } @@ -642,14 +639,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) ses->auth_key.response + CIFS_SESS_KEY_SIZE, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cERROR(1, "%s: Could not update with response", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); goto setup_ntlmv2_rsp_ret; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); setup_ntlmv2_rsp_ret: kfree(tiblob); @@ -671,7 +668,7 @@ calc_seckey(struct cifs_ses *ses) tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_arc4)) { rc = PTR_ERR(tfm_arc4); - cERROR(1, "could not allocate crypto API arc4"); + cifs_dbg(VFS, "could not allocate crypto API arc4\n"); return rc; } @@ -680,7 +677,8 @@ calc_seckey(struct cifs_ses *ses) rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); if (rc) { - cERROR(1, "%s: Could not set response as a key", __func__); + cifs_dbg(VFS, "%s: Could not set response as a key\n", + __func__); return rc; } @@ -689,7 +687,7 @@ calc_seckey(struct cifs_ses *ses) rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); if (rc) { - cERROR(1, "could not encrypt session key rc: %d", rc); + cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc); crypto_free_blkcipher(tfm_arc4); return rc; } @@ -707,6 +705,9 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_shash_release(struct TCP_Server_Info *server) { + if (server->secmech.cmacaes) + crypto_free_shash(server->secmech.cmacaes); + if (server->secmech.hmacsha256) crypto_free_shash(server->secmech.hmacsha256); @@ -716,6 +717,8 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server) if (server->secmech.hmacmd5) crypto_free_shash(server->secmech.hmacmd5); + kfree(server->secmech.sdesccmacaes); + kfree(server->secmech.sdeschmacsha256); kfree(server->secmech.sdeschmacmd5); @@ -731,29 +734,35 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); if (IS_ERR(server->secmech.hmacmd5)) { - cERROR(1, "could not allocate crypto hmacmd5"); + cifs_dbg(VFS, "could not allocate crypto hmacmd5\n"); return PTR_ERR(server->secmech.hmacmd5); } server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(server->secmech.md5)) { - cERROR(1, "could not allocate crypto md5"); + cifs_dbg(VFS, "could not allocate crypto md5\n"); rc = PTR_ERR(server->secmech.md5); goto crypto_allocate_md5_fail; } server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); if (IS_ERR(server->secmech.hmacsha256)) { - cERROR(1, "could not allocate crypto hmacsha256\n"); + cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); rc = PTR_ERR(server->secmech.hmacsha256); goto crypto_allocate_hmacsha256_fail; } + server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0); + if (IS_ERR(server->secmech.cmacaes)) { + cifs_dbg(VFS, "could not allocate crypto cmac-aes"); + rc = PTR_ERR(server->secmech.cmacaes); + goto crypto_allocate_cmacaes_fail; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(server->secmech.hmacmd5); server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdeschmacmd5) { - cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5"); rc = -ENOMEM; goto crypto_allocate_hmacmd5_sdesc_fail; } @@ -764,7 +773,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) crypto_shash_descsize(server->secmech.md5); server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdescmd5) { - cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5"); rc = -ENOMEM; goto crypto_allocate_md5_sdesc_fail; } @@ -775,15 +783,28 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) crypto_shash_descsize(server->secmech.hmacsha256); server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdeschmacsha256) { - cERROR(1, "%s: Can't alloc hmacsha256\n", __func__); rc = -ENOMEM; goto crypto_allocate_hmacsha256_sdesc_fail; } server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256; server->secmech.sdeschmacsha256->shash.flags = 0x0; + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.cmacaes); + server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdesccmacaes) { + cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__); + rc = -ENOMEM; + goto crypto_allocate_cmacaes_sdesc_fail; + } + server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes; + server->secmech.sdesccmacaes->shash.flags = 0x0; + return 0; +crypto_allocate_cmacaes_sdesc_fail: + kfree(server->secmech.sdeschmacsha256); + crypto_allocate_hmacsha256_sdesc_fail: kfree(server->secmech.sdescmd5); @@ -791,6 +812,9 @@ crypto_allocate_md5_sdesc_fail: kfree(server->secmech.sdeschmacmd5); crypto_allocate_hmacmd5_sdesc_fail: + crypto_free_shash(server->secmech.cmacaes); + +crypto_allocate_cmacaes_fail: crypto_free_shash(server->secmech.hmacsha256); crypto_allocate_hmacsha256_fail: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 345fc89c4286..4bdd547dbf6f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -161,7 +161,7 @@ cifs_read_super(struct super_block *sb) #ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - cFYI(1, "export ops supported"); + cifs_dbg(FYI, "export ops supported\n"); sb->s_export_op = &cifs_export_ops; } #endif /* CONFIG_CIFS_NFSD_EXPORT */ @@ -169,7 +169,7 @@ cifs_read_super(struct super_block *sb) return 0; out_no_root: - cERROR(1, "cifs_read_super: get root inode failed"); + cifs_dbg(VFS, "%s: get root inode failed\n", __func__); return rc; } @@ -312,11 +312,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) } static void -cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +cifs_show_security(struct seq_file *s, struct cifs_ses *ses) { + if (ses->sectype == Unspecified) + return; + seq_printf(s, ",sec="); - switch (server->secType) { + switch (ses->sectype) { case LANMAN: seq_printf(s, "lanman"); break; @@ -338,7 +341,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) break; } - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (ses->sign) seq_printf(s, "i"); } @@ -369,12 +372,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root) srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); - cifs_show_security(s, tcon->ses->server); + cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); - seq_printf(s, ",unc="); - seq_escape(s, tcon->treeName, " \t\n\\"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); else if (tcon->ses->user_name) @@ -502,7 +502,7 @@ static void cifs_umount_begin(struct super_block *sb) /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ /* cancel_notify_requests(tcon); */ if (tcon->ses && tcon->ses->server) { - cFYI(1, "wake up tasks now - umount begin not complete"); + cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n"); wake_up_all(&tcon->ses->server->request_q); wake_up_all(&tcon->ses->server->response_q); msleep(1); /* yield */ @@ -573,7 +573,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) if (full_path == NULL) return ERR_PTR(-ENOMEM); - cFYI(1, "Get root dentry for %s", full_path); + cifs_dbg(FYI, "Get root dentry for %s\n", full_path); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); @@ -632,7 +632,7 @@ cifs_do_mount(struct file_system_type *fs_type, struct cifs_mnt_data mnt_data; struct dentry *root; - cFYI(1, "Devname: %s flags: %d ", dev_name, flags); + cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); volume_info = cifs_get_volume_info((char *)data, dev_name); if (IS_ERR(volume_info)) @@ -655,7 +655,8 @@ cifs_do_mount(struct file_system_type *fs_type, rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!(flags & MS_SILENT)) - cERROR(1, "cifs_mount failed w/return code = %d", rc); + cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", + rc); root = ERR_PTR(rc); goto out_mountdata; } @@ -675,7 +676,7 @@ cifs_do_mount(struct file_system_type *fs_type, } if (sb->s_root) { - cFYI(1, "Use existing superblock"); + cifs_dbg(FYI, "Use existing superblock\n"); cifs_umount(cifs_sb); } else { rc = cifs_read_super(sb); @@ -691,7 +692,7 @@ cifs_do_mount(struct file_system_type *fs_type, if (IS_ERR(root)) goto out_super; - cFYI(1, "dentry root is: %p", root); + cifs_dbg(FYI, "dentry root is: %p\n", root); goto out; out_super: @@ -723,7 +724,8 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, rc = filemap_fdatawrite(inode->i_mapping); if (rc) - cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); + cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", + rc, inode); return written; } @@ -766,7 +768,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) { - /* note that this is called by vfs setlease with lock_flocks held + /* note that this is called by vfs setlease with i_lock held to protect *lease from going away */ struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; @@ -969,7 +971,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { }; const struct file_operations cifs_dir_ops = { - .readdir = cifs_readdir, + .iterate = cifs_readdir, .release = cifs_closedir, .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, @@ -1030,7 +1032,10 @@ cifs_init_request_bufs(void) } else { CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ } -/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */ +/* + cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n", + CIFSMaxBufSize, CIFSMaxBufSize); +*/ cifs_req_cachep = kmem_cache_create("cifs_request", CIFSMaxBufSize + max_hdr_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -1041,7 +1046,7 @@ cifs_init_request_bufs(void) cifs_min_rcv = 1; else if (cifs_min_rcv > 64) { cifs_min_rcv = 64; - cERROR(1, "cifs_min_rcv set to maximum (64)"); + cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n"); } cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, @@ -1072,7 +1077,7 @@ cifs_init_request_bufs(void) cifs_min_small = 2; else if (cifs_min_small > 256) { cifs_min_small = 256; - cFYI(1, "cifs_min_small set to maximum (256)"); + cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n"); } cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, @@ -1163,10 +1168,11 @@ init_cifs(void) if (cifs_max_pending < 2) { cifs_max_pending = 2; - cFYI(1, "cifs_max_pending set to min of 2"); + cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); } else if (cifs_max_pending > CIFS_MAX_REQ) { cifs_max_pending = CIFS_MAX_REQ; - cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ); + cifs_dbg(FYI, "cifs_max_pending set to max of %u\n", + CIFS_MAX_REQ); } cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); @@ -1235,7 +1241,7 @@ out_clean_proc: static void __exit exit_cifs(void) { - cFYI(DBG2, "exit_cifs"); + cifs_dbg(NOISY, "exit_cifs\n"); unregister_filesystem(&cifs_fs_type); cifs_dfs_release_automount_timer(); #ifdef CONFIG_CIFS_ACL diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0e32c3446ce9..ea723a5e8226 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); -extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); +extern int cifs_readdir(struct file *file, struct dir_context *ctx); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; @@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.0" +#define CIFS_VERSION "2.01" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 4f07f6fbe494..e66b08882548 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -101,20 +101,14 @@ enum statusEnum { }; enum securityEnum { - LANMAN = 0, /* Legacy LANMAN auth */ + Unspecified = 0, /* not specified */ + LANMAN, /* Legacy LANMAN auth */ NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ -/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ Kerberos, /* Kerberos via SPNEGO */ }; -enum protocolEnum { - TCP = 0, - SCTP - /* Netbios frames protocol not supported at this time */ -}; - struct session_key { unsigned int len; char *response; @@ -131,9 +125,11 @@ struct cifs_secmech { struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ struct crypto_shash *md5; /* md5 hash function */ struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ + struct crypto_shash *cmacaes; /* block-cipher based MAC function */ struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ + struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ }; /* per smb session structure/fields */ @@ -181,6 +177,7 @@ enum smb_version { Smb_20, Smb_21, Smb_30, + Smb_302, }; struct mid_q_entry; @@ -228,6 +225,7 @@ struct smb_version_operations { void (*dump_detail)(void *); void (*clear_stats)(struct cifs_tcon *); void (*print_stats)(struct seq_file *m, struct cifs_tcon *); + void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); @@ -367,6 +365,8 @@ struct smb_version_operations { void (*set_lease_key)(struct inode *, struct cifs_fid *fid); /* generate new lease key */ void (*new_lease_key)(struct cifs_fid *fid); + /* The next two functions will need to be changed to per smb session */ + void (*generate_signingkey)(struct TCP_Server_Info *server); int (*calc_signature)(struct smb_rqst *rqst, struct TCP_Server_Info *server); }; @@ -387,6 +387,8 @@ struct smb_version_values { unsigned int cap_nt_find; unsigned int cap_large_files; unsigned int oplock_read; + __u16 signing_enabled; + __u16 signing_required; }; #define HEADER_SIZE(server) (server->vals->header_size) @@ -407,7 +409,8 @@ struct smb_vol { kgid_t backupgid; umode_t file_mode; umode_t dir_mode; - unsigned secFlg; + enum securityEnum sectype; /* sectype requested via mnt opts */ + bool sign; /* was signing requested via mnt opts? */ bool retry:1; bool intr:1; bool setuids:1; @@ -441,6 +444,7 @@ struct smb_vol { bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; bool rwpidforward:1; /* pid forward for read/write operations */ + bool nosharesock; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -514,6 +518,7 @@ struct TCP_Server_Info { struct task_struct *tsk; char server_GUID[16]; __u16 sec_mode; + bool sign; /* is signing enabled on this connection? */ bool session_estab; /* mark when very first sess is established */ #ifdef CONFIG_CIFS_SMB2 int echo_credits; /* echo reserved slots */ @@ -521,7 +526,6 @@ struct TCP_Server_Info { bool echoes:1; /* enable echoes */ #endif u16 dialect; /* dialect index that server chose */ - enum securityEnum secType; bool oplocks:1; /* enable oplocks */ unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ @@ -540,12 +544,17 @@ struct TCP_Server_Info { int timeAdj; /* Adjust for difference in server time zone in sec */ __u64 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ + char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ +#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */ +#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */ +#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */ + char negflavor; /* NEGOTIATE response flavor */ /* extended security flavors that server supports */ bool sec_ntlmssp; /* supports NTLMSSP */ bool sec_kerberosu2u; /* supports U2U Kerberos */ @@ -697,7 +706,6 @@ struct cifs_ses { enum statusEnum status; unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ - __u16 flags; __u16 vcnum; char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -714,21 +722,14 @@ struct cifs_ses { char *password; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ + enum securityEnum sectype; /* what security flavor was specified? */ + bool sign; /* is signing required? */ bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; #endif /* CONFIG_CIFS_SMB2 */ }; -/* no more than one of the following three session flags may be set */ -#define CIFS_SES_NT4 1 -#define CIFS_SES_OS2 2 -#define CIFS_SES_W9X 4 -/* following flag is set for old servers such as OS2 (and Win95?) - which do not negotiate NTLM or POSIX dialects, but instead - negotiate one of the older LANMAN dialects */ -#define CIFS_SES_LANMAN 8 - static inline bool cap_unix(struct cifs_ses *ses) { @@ -816,7 +817,7 @@ struct cifs_tcon { #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ - __u32 capabilities; + __le32 capabilities; __u32 share_flags; __u32 maximal_access; __u32 vol_serial_number; @@ -1348,7 +1349,7 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ -#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP) +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* @@ -1494,4 +1495,7 @@ extern struct smb_version_values smb21_values; #define SMB30_VERSION_STRING "3.0" extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; +#define SMB302_VERSION_STRING "3.02" +/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ +extern struct smb_version_values smb302_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index e996ff6b26d1..11ca24a8e054 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -142,6 +142,11 @@ */ #define CIFS_SESS_KEY_SIZE (16) +/* + * Size of the smb3 signing key + */ +#define SMB3_SIGN_KEY_SIZE (16) + #define CIFS_CLIENT_CHALLENGE_SIZE (8) #define CIFS_SERVER_CHALLENGE_SIZE (8) #define CIFS_HMAC_MD5_HASH_SIZE (16) @@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp { #define READ_RAW_ENABLE 1 #define WRITE_RAW_ENABLE 2 #define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) - +#define SMB1_CLIENT_GUID_SIZE (16) typedef struct negotiate_rsp { struct smb_hdr hdr; /* wct = 17 */ __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ @@ -553,7 +558,7 @@ typedef struct negotiate_rsp { /* followed by 16 bytes of server GUID */ /* then security blob if cap_extended_security negotiated */ struct { - unsigned char GUID[16]; + unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; unsigned char SecurityBlob[1]; } __attribute__((packed)) extended_response; } __attribute__((packed)) u; @@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp { /* parms and data follow */ } __attribute__((packed)) NTRANSACT_RSP; +/* See MS-SMB 2.2.7.2.1.1 */ +struct srv_copychunk { + __le64 SourceOffset; + __le64 DestinationOffset; + __le32 CopyLength; + __u32 Reserved; +} __packed; + typedef struct smb_com_transaction_ioctl_req { struct smb_hdr hdr; /* wct = 23 */ __u8 MaxSetupCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f450f0683ddd..c8ff018fae68 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -45,17 +45,17 @@ extern void _free_xid(unsigned int); #define get_xid() \ ({ \ unsigned int __xid = _get_xid(); \ - cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \ - __func__, __xid, \ - from_kuid(&init_user_ns, current_fsuid())); \ + cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \ + __func__, __xid, \ + from_kuid(&init_user_ns, current_fsuid())); \ __xid; \ }) #define free_xid(curr_xid) \ do { \ _free_xid(curr_xid); \ - cFYI(1, "CIFS VFS: leaving %s (xid = %u) rc = %d", \ - __func__, curr_xid, (int)rc); \ + cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \ + __func__, curr_xid, (int)rc); \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); @@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ , extern int small_smb_init_no_tc(const int smb_cmd, const int wct, struct cifs_ses *ses, void **request_buf); +extern enum securityEnum select_sectype(struct TCP_Server_Info *server, + enum securityEnum requested); extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); @@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses); extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info); +extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required); extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses); extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, @@ -433,6 +436,7 @@ extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); +extern void generate_smb3signingkey(struct TCP_Server_Info *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 8e2e799e7a24..a89c4cb4e6cf 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -139,8 +139,8 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { - cFYI(1, "can not send cmd %d while umounting", - smb_command); + cifs_dbg(FYI, "can not send cmd %d while umounting\n", + smb_command); return -ENODEV; } } @@ -163,7 +163,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * back on-line */ if (!tcon->retry) { - cFYI(1, "gave up waiting on reconnect in smb_init"); + cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); return -EHOSTDOWN; } } @@ -191,7 +191,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) cifs_mark_open_files_invalid(tcon); rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&ses->session_mutex); - cFYI(1, "reconnect tcon rc = %d", rc); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; @@ -367,6 +367,185 @@ vt2_err: return -EINVAL; } +static int +decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr) +{ + int rc = 0; + u16 count; + char *guid = pSMBr->u.extended_response.GUID; + struct TCP_Server_Info *server = ses->server; + + count = get_bcc(&pSMBr->hdr); + if (count < SMB1_CLIENT_GUID_SIZE) + return -EIO; + + spin_lock(&cifs_tcp_ses_lock); + if (server->srv_count > 1) { + spin_unlock(&cifs_tcp_ses_lock); + if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) { + cifs_dbg(FYI, "server UID changed\n"); + memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE); + } + } else { + spin_unlock(&cifs_tcp_ses_lock); + memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE); + } + + if (count == SMB1_CLIENT_GUID_SIZE) { + server->sec_ntlmssp = true; + } else { + count -= SMB1_CLIENT_GUID_SIZE; + rc = decode_negTokenInit( + pSMBr->u.extended_response.SecurityBlob, count, server); + if (rc != 1) + return -EINVAL; + } + + return 0; +} + +int +cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) +{ + bool srv_sign_required = server->sec_mode & server->vals->signing_required; + bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled; + bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN; + + /* + * Is signing required by mnt options? If not then check + * global_secflags to see if it is there. + */ + if (!mnt_sign_required) + mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) == + CIFSSEC_MUST_SIGN); + + /* + * If signing is required then it's automatically enabled too, + * otherwise, check to see if the secflags allow it. + */ + mnt_sign_enabled = mnt_sign_required ? mnt_sign_required : + (global_secflags & CIFSSEC_MAY_SIGN); + + /* If server requires signing, does client allow it? */ + if (srv_sign_required) { + if (!mnt_sign_enabled) { + cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!"); + return -ENOTSUPP; + } + server->sign = true; + } + + /* If client requires signing, does server allow it? */ + if (mnt_sign_required) { + if (!srv_sign_enabled) { + cifs_dbg(VFS, "Server does not support signing!"); + return -ENOTSUPP; + } + server->sign = true; + } + + return 0; +} + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static int +decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) +{ + __s16 tmp; + struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; + + if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT) + return -EOPNOTSUPP; + + server->sec_mode = le16_to_cpu(rsp->SecurityMode); + server->maxReq = min_t(unsigned int, + le16_to_cpu(rsp->MaxMpxCount), + cifs_max_pending); + set_credits(server, server->maxReq); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); + /* even though we do not use raw we might as well set this + accurately, in case we ever find a need for it */ + if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { + server->max_rw = 0xFF00; + server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; + } else { + server->max_rw = 0;/* do not need to use raw anyway */ + server->capabilities = CAP_MPX_MODE; + } + tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); + if (tmp == -1) { + /* OS/2 often does not set timezone therefore + * we must use server time to calc time zone. + * Could deviate slightly from the right zone. + * Smallest defined timezone difference is 15 minutes + * (i.e. Nepal). Rounding up/down is done to match + * this requirement. + */ + int val, seconds, remain, result; + struct timespec ts, utc; + utc = CURRENT_TIME; + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); + cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", + (int)ts.tv_sec, (int)utc.tv_sec, + (int)(utc.tv_sec - ts.tv_sec)); + val = (int)(utc.tv_sec - ts.tv_sec); + seconds = abs(val); + result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; + remain = seconds % MIN_TZ_ADJ; + if (remain >= (MIN_TZ_ADJ / 2)) + result += MIN_TZ_ADJ; + if (val < 0) + result = -result; + server->timeAdj = result; + } else { + server->timeAdj = (int)tmp; + server->timeAdj *= 60; /* also in seconds */ + } + cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj); + + + /* BB get server time for time conversions and add + code to use it and timezone since this is not UTC */ + + if (rsp->EncryptionKeyLength == + cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { + memcpy(server->cryptkey, rsp->EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + return -EIO; /* need cryptkey unless plain text */ + } + + cifs_dbg(FYI, "LANMAN negotiated\n"); + return 0; +} +#else +static inline int +decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) +{ + cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); + return -EOPNOTSUPP; +} +#endif + +static bool +should_set_ext_sec_flag(enum securityEnum sectype) +{ + switch (sectype) { + case RawNTLMSSP: + case Kerberos: + return true; + case Unspecified: + if (global_secflags & + (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)) + return true; + /* Fallthrough */ + default: + return false; + } +} + int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) { @@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) int rc = 0; int bytes_returned; int i; - struct TCP_Server_Info *server; + struct TCP_Server_Info *server = ses->server; u16 count; - unsigned int secFlags; - if (ses->server) - server = ses->server; - else { - rc = -EIO; - return rc; + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; } + rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ , (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; - /* if any of auth flags (ie not sign or seal) are overriden use them */ - if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ - else /* if override flags set only sign/seal OR them with global auth */ - secFlags = global_secflags | ses->overrideSecFlg; - - cFYI(1, "secFlags 0x%x", secFlags); - pSMB->hdr.Mid = get_next_mid(server); pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); - if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) - pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { - cFYI(1, "Kerberos only mechanism, enable extended security"); - pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) - pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { - cFYI(1, "NTLMSSP only mechanism, enable extended security"); + if (should_set_ext_sec_flag(ses->sectype)) { + cifs_dbg(FYI, "Requesting extended security."); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } @@ -428,7 +590,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) goto neg_err_exit; server->dialect = le16_to_cpu(pSMBr->DialectIndex); - cFYI(1, "Dialect: %d", server->dialect); + cifs_dbg(FYI, "Dialect: %d\n", server->dialect); /* Check wct = 1 error case */ if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) { /* core returns wct = 1, but we do not ask for core - otherwise @@ -436,129 +598,20 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) could not negotiate a common dialect */ rc = -EOPNOTSUPP; goto neg_err_exit; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - } else if ((pSMBr->hdr.WordCount == 13) - && ((server->dialect == LANMAN_PROT) - || (server->dialect == LANMAN2_PROT))) { - __s16 tmp; - struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; - - if ((secFlags & CIFSSEC_MAY_LANMAN) || - (secFlags & CIFSSEC_MAY_PLNTXT)) - server->secType = LANMAN; - else { - cERROR(1, "mount failed weak security disabled" - " in /proc/fs/cifs/SecurityFlags"); - rc = -EOPNOTSUPP; - goto neg_err_exit; - } - server->sec_mode = le16_to_cpu(rsp->SecurityMode); - server->maxReq = min_t(unsigned int, - le16_to_cpu(rsp->MaxMpxCount), - cifs_max_pending); - set_credits(server, server->maxReq); - server->maxBuf = le16_to_cpu(rsp->MaxBufSize); - server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); - /* even though we do not use raw we might as well set this - accurately, in case we ever find a need for it */ - if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { - server->max_rw = 0xFF00; - server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; - } else { - server->max_rw = 0;/* do not need to use raw anyway */ - server->capabilities = CAP_MPX_MODE; - } - tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); - if (tmp == -1) { - /* OS/2 often does not set timezone therefore - * we must use server time to calc time zone. - * Could deviate slightly from the right zone. - * Smallest defined timezone difference is 15 minutes - * (i.e. Nepal). Rounding up/down is done to match - * this requirement. - */ - int val, seconds, remain, result; - struct timespec ts, utc; - utc = CURRENT_TIME; - ts = cnvrtDosUnixTm(rsp->SrvTime.Date, - rsp->SrvTime.Time, 0); - cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d", - (int)ts.tv_sec, (int)utc.tv_sec, - (int)(utc.tv_sec - ts.tv_sec)); - val = (int)(utc.tv_sec - ts.tv_sec); - seconds = abs(val); - result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; - remain = seconds % MIN_TZ_ADJ; - if (remain >= (MIN_TZ_ADJ / 2)) - result += MIN_TZ_ADJ; - if (val < 0) - result = -result; - server->timeAdj = result; - } else { - server->timeAdj = (int)tmp; - server->timeAdj *= 60; /* also in seconds */ - } - cFYI(1, "server->timeAdj: %d seconds", server->timeAdj); - - - /* BB get server time for time conversions and add - code to use it and timezone since this is not UTC */ - - if (rsp->EncryptionKeyLength == - cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { - memcpy(ses->server->cryptkey, rsp->EncryptionKey, - CIFS_CRYPTO_KEY_SIZE); - } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { - rc = -EIO; /* need cryptkey unless plain text */ - goto neg_err_exit; - } - - cFYI(1, "LANMAN negotiated"); - /* we will not end up setting signing flags - as no signing - was in LANMAN and server did not return the flags on */ - goto signing_check; -#else /* weak security disabled */ } else if (pSMBr->hdr.WordCount == 13) { - cERROR(1, "mount failed, cifs module not built " - "with CIFS_WEAK_PW_HASH support"); - rc = -EOPNOTSUPP; -#endif /* WEAK_PW_HASH */ - goto neg_err_exit; + server->negflavor = CIFS_NEGFLAVOR_LANMAN; + rc = decode_lanman_negprot_rsp(server, pSMBr); + goto signing_check; } else if (pSMBr->hdr.WordCount != 17) { /* unknown wct */ rc = -EOPNOTSUPP; goto neg_err_exit; } - /* else wct == 17 NTLM */ + /* else wct == 17, NTLM or better */ + server->sec_mode = pSMBr->SecurityMode; if ((server->sec_mode & SECMODE_USER) == 0) - cFYI(1, "share mode security"); - - if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0) -#ifdef CONFIG_CIFS_WEAK_PW_HASH - if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) -#endif /* CIFS_WEAK_PW_HASH */ - cERROR(1, "Server requests plain text password" - " but client support disabled"); - - if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) - server->secType = NTLMv2; - else if (secFlags & CIFSSEC_MAY_NTLM) - server->secType = NTLM; - else if (secFlags & CIFSSEC_MAY_NTLMV2) - server->secType = NTLMv2; - else if (secFlags & CIFSSEC_MAY_KRB5) - server->secType = Kerberos; - else if (secFlags & CIFSSEC_MAY_NTLMSSP) - server->secType = RawNTLMSSP; - else if (secFlags & CIFSSEC_MAY_LANMAN) - server->secType = LANMAN; - else { - rc = -EOPNOTSUPP; - cERROR(1, "Invalid security type"); - goto neg_err_exit; - } - /* else ... any others ...? */ + cifs_dbg(FYI, "share mode security\n"); /* one byte, so no need to convert this or EncryptionKeyLen from little endian */ @@ -568,100 +621,34 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) /* probably no need to store and check maxvcs */ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); - cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); + cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; + if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + server->negflavor = CIFS_NEGFLAVOR_UNENCAP; memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || server->capabilities & CAP_EXTENDED_SECURITY) && (pSMBr->EncryptionKeyLength == 0)) { - /* decode security blob */ - count = get_bcc(&pSMBr->hdr); - if (count < 16) { - rc = -EIO; - goto neg_err_exit; - } - spin_lock(&cifs_tcp_ses_lock); - if (server->srv_count > 1) { - spin_unlock(&cifs_tcp_ses_lock); - if (memcmp(server->server_GUID, - pSMBr->u.extended_response. - GUID, 16) != 0) { - cFYI(1, "server UID changed"); - memcpy(server->server_GUID, - pSMBr->u.extended_response.GUID, - 16); - } - } else { - spin_unlock(&cifs_tcp_ses_lock); - memcpy(server->server_GUID, - pSMBr->u.extended_response.GUID, 16); - } - - if (count == 16) { - server->secType = RawNTLMSSP; - } else { - rc = decode_negTokenInit(pSMBr->u.extended_response. - SecurityBlob, count - 16, - server); - if (rc == 1) - rc = 0; - else - rc = -EINVAL; - if (server->secType == Kerberos) { - if (!server->sec_kerberos && - !server->sec_mskerberos) - rc = -EOPNOTSUPP; - } else if (server->secType == RawNTLMSSP) { - if (!server->sec_ntlmssp) - rc = -EOPNOTSUPP; - } else - rc = -EOPNOTSUPP; - } + server->negflavor = CIFS_NEGFLAVOR_EXTENDED; + rc = decode_ext_sec_blob(ses, pSMBr); } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { rc = -EIO; /* no crypt key only if plain text pwd */ - goto neg_err_exit; - } else - server->capabilities &= ~CAP_EXTENDED_SECURITY; - -#ifdef CONFIG_CIFS_WEAK_PW_HASH -signing_check: -#endif - if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { - /* MUST_SIGN already includes the MAY_SIGN FLAG - so if this is zero it means that signing is disabled */ - cFYI(1, "Signing disabled"); - if (server->sec_mode & SECMODE_SIGN_REQUIRED) { - cERROR(1, "Server requires " - "packet signing to be enabled in " - "/proc/fs/cifs/SecurityFlags."); - rc = -EOPNOTSUPP; - } - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { - /* signing required */ - cFYI(1, "Must sign - secFlags 0x%x", secFlags); - if ((server->sec_mode & - (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { - cERROR(1, "signing required but server lacks support"); - rc = -EOPNOTSUPP; - } else - server->sec_mode |= SECMODE_SIGN_REQUIRED; } else { - /* signing optional ie CIFSSEC_MAY_SIGN */ - if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0) - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + server->negflavor = CIFS_NEGFLAVOR_UNENCAP; + server->capabilities &= ~CAP_EXTENDED_SECURITY; } +signing_check: + if (!rc) + rc = cifs_enable_signing(server, ses->sign); neg_err_exit: cifs_buf_release(pSMB); - cFYI(1, "negprot rc %d", rc); + cifs_dbg(FYI, "negprot rc %d\n", rc); return rc; } @@ -671,7 +658,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) struct smb_hdr *smb_buffer; int rc = 0; - cFYI(1, "In tree disconnect"); + cifs_dbg(FYI, "In tree disconnect\n"); /* BB: do we need to check this? These should never be NULL. */ if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) @@ -693,7 +680,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0); if (rc) - cFYI(1, "Tree disconnect failed %d", rc); + cifs_dbg(FYI, "Tree disconnect failed %d\n", rc); /* No need to return error on this operation if tid invalidated and closed on server already e.g. due to tcp session crashing */ @@ -728,7 +715,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; - cFYI(1, "In echo request"); + cifs_dbg(FYI, "In echo request\n"); rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); if (rc) @@ -747,7 +734,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, server, CIFS_ASYNC_OP | CIFS_ECHO_OP); if (rc) - cFYI(1, "Echo request failed: %d", rc); + cifs_dbg(FYI, "Echo request failed: %d\n", rc); cifs_small_buf_release(smb); @@ -760,7 +747,7 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) LOGOFF_ANDX_REQ *pSMB; int rc = 0; - cFYI(1, "In SMBLogoff for session disconnect"); + cifs_dbg(FYI, "In SMBLogoff for session disconnect\n"); /* * BB: do we need to check validity of ses and server? They should @@ -782,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) pSMB->hdr.Mid = get_next_mid(ses->server); - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + if (ses->server->sign) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; @@ -814,7 +800,7 @@ CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In POSIX delete"); + cifs_dbg(FYI, "In POSIX delete\n"); PsxDelete: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -866,7 +852,7 @@ PsxDelete: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "Posix delete returned %d", rc); + cifs_dbg(FYI, "Posix delete returned %d\n", rc); cifs_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes); @@ -914,7 +900,7 @@ DelFileRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes); if (rc) - cFYI(1, "Error in RMFile = %d", rc); + cifs_dbg(FYI, "Error in RMFile = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -934,7 +920,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, int name_len; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSSMBRmDir"); + cifs_dbg(FYI, "In CIFSSMBRmDir\n"); RmDirRetry: rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -960,7 +946,7 @@ RmDirRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs); if (rc) - cFYI(1, "Error in RMDir = %d", rc); + cifs_dbg(FYI, "Error in RMDir = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -979,7 +965,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, int name_len; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSSMBMkDir"); + cifs_dbg(FYI, "In CIFSSMBMkDir\n"); MkDirRetry: rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -1005,7 +991,7 @@ MkDirRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs); if (rc) - cFYI(1, "Error in Mkdir = %d", rc); + cifs_dbg(FYI, "Error in Mkdir = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -1029,7 +1015,7 @@ CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon, OPEN_PSX_REQ *pdata; OPEN_PSX_RSP *psx_rsp; - cFYI(1, "In POSIX Create"); + cifs_dbg(FYI, "In POSIX Create\n"); PsxCreat: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -1083,11 +1069,11 @@ PsxCreat: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Posix create returned %d", rc); + cifs_dbg(FYI, "Posix create returned %d\n", rc); goto psx_create_err; } - cFYI(1, "copying inode info"); + cifs_dbg(FYI, "copying inode info\n"); rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) { @@ -1109,11 +1095,11 @@ PsxCreat: /* check to make sure response data is there */ if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { pRetData->Type = cpu_to_le32(-1); /* unknown */ - cFYI(DBG2, "unknown type"); + cifs_dbg(NOISY, "unknown type\n"); } else { if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP) + sizeof(FILE_UNIX_BASIC_INFO)) { - cERROR(1, "Open response data too small"); + cifs_dbg(VFS, "Open response data too small\n"); pRetData->Type = cpu_to_le32(-1); goto psx_create_err; } @@ -1160,7 +1146,7 @@ static __u16 convert_disposition(int disposition) ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; break; default: - cFYI(1, "unknown disposition %d", disposition); + cifs_dbg(FYI, "unknown disposition %d\n", disposition); ofun = SMBOPEN_OAPPEND; /* regular open */ } return ofun; @@ -1251,7 +1237,7 @@ OldOpenRetry: (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { - cFYI(1, "Error in Open = %d", rc); + cifs_dbg(FYI, "Error in Open = %d\n", rc); } else { /* BB verify if wct == 15 */ @@ -1364,7 +1350,7 @@ openRetry: (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { - cFYI(1, "Error in Open = %d", rc); + cifs_dbg(FYI, "Error in Open = %d\n", rc); } else { *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ *netfid = pSMBr->Fid; /* cifs fid stays in le */ @@ -1425,8 +1411,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) char *buf = server->smallbuf; unsigned int buflen = get_rfc1002_length(buf) + 4; - cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__, - mid->mid, rdata->offset, rdata->bytes); + cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", + __func__, mid->mid, rdata->offset, rdata->bytes); /* * read the rest of READ_RSP header (sans Data array), or whatever we @@ -1447,16 +1433,16 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { - cFYI(1, "%s: server returned error %d", __func__, - rdata->result); + cifs_dbg(FYI, "%s: server returned error %d\n", + __func__, rdata->result); return cifs_readv_discard(server, mid); } /* Is there enough to get to the rest of the READ_RSP header? */ if (server->total_read < server->vals->read_rsp_size) { - cFYI(1, "%s: server returned short header. got=%u expected=%zu", - __func__, server->total_read, - server->vals->read_rsp_size); + cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n", + __func__, server->total_read, + server->vals->read_rsp_size); rdata->result = -EIO; return cifs_readv_discard(server, mid); } @@ -1468,19 +1454,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) * is beyond the EOF. Treat it as if the data starts just after * the header. */ - cFYI(1, "%s: data offset (%u) inside read response header", - __func__, data_offset); + cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n", + __func__, data_offset); data_offset = server->total_read; } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { /* data_offset is beyond the end of smallbuf */ - cFYI(1, "%s: data offset (%u) beyond end of smallbuf", - __func__, data_offset); + cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", + __func__, data_offset); rdata->result = -EIO; return cifs_readv_discard(server, mid); } - cFYI(1, "%s: total_read=%u data_offset=%u", __func__, - server->total_read, data_offset); + cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n", + __func__, server->total_read, data_offset); len = data_offset - server->total_read; if (len > 0) { @@ -1496,8 +1482,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* set up first iov for signature check */ rdata->iov.iov_base = buf; rdata->iov.iov_len = server->total_read; - cFYI(1, "0: iov_base=%p iov_len=%zu", - rdata->iov.iov_base, rdata->iov.iov_len); + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov.iov_base, rdata->iov.iov_len); /* how much data is in the response? */ data_len = server->ops->read_data_length(buf); @@ -1514,8 +1500,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; rdata->bytes = length; - cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read, - buflen, data_len); + cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", + server->total_read, buflen, data_len); /* discard anything left over */ if (server->total_read < buflen) @@ -1538,21 +1524,21 @@ cifs_readv_callback(struct mid_q_entry *mid) .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; - cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, - mid->mid, mid->mid_state, rdata->result, rdata->bytes); + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", + __func__, mid->mid, mid->mid_state, rdata->result, + rdata->bytes); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: /* result already set, check signature */ - if (server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (server->sign) { int rc = 0; rc = cifs_verify_signature(&rqst, server, - mid->sequence_number + 1); + mid->sequence_number); if (rc) - cERROR(1, "SMB signature verification returned " - "error = %d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } /* FIXME: should this be counted toward the initiating task? */ task_io_account_read(rdata->bytes); @@ -1582,8 +1568,8 @@ cifs_async_readv(struct cifs_readdata *rdata) struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; - cFYI(1, "%s: offset=%llu bytes=%u", __func__, - rdata->offset, rdata->bytes); + cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", + __func__, rdata->offset, rdata->bytes); if (tcon->ses->capabilities & CAP_LARGE_FILES) wct = 12; @@ -1653,7 +1639,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, struct cifs_tcon *tcon = io_parms->tcon; unsigned int count = io_parms->length; - cFYI(1, "Reading %d bytes on fid %d", count, netfid); + cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid); if (tcon->ses->capabilities & CAP_LARGE_FILES) wct = 12; else { @@ -1701,7 +1687,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); pSMBr = (READ_RSP *)iov[0].iov_base; if (rc) { - cERROR(1, "Send error in read = %d", rc); + cifs_dbg(VFS, "Send error in read = %d\n", rc); } else { int data_length = le16_to_cpu(pSMBr->DataLengthHigh); data_length = data_length << 16; @@ -1711,7 +1697,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, /*check that DataLength would not go beyond end of SMB */ if ((data_length > CIFSMaxBufSize) || (data_length > count)) { - cFYI(1, "bad length %d for count %d", + cifs_dbg(FYI, "bad length %d for count %d\n", data_length, count); rc = -EIO; *nbytes = 0; @@ -1719,7 +1705,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, pReadData = (char *) (&pSMBr->hdr.Protocol) + le16_to_cpu(pSMBr->DataOffset); /* if (rc = copy_to_user(buf, pReadData, data_length)) { - cERROR(1, "Faulting on read rc = %d",rc); + cifs_dbg(VFS, "Faulting on read rc = %d\n",rc); rc = -EFAULT; }*/ /* can not use copy_to_user when using page cache*/ if (*buf) @@ -1767,7 +1753,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes = 0; - /* cFYI(1, "write at %lld %d bytes", offset, count);*/ + /* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/ if (tcon->ses == NULL) return -ECONNABORTED; @@ -1852,7 +1838,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, (struct smb_hdr *) pSMBr, &bytes_returned, long_op); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { - cFYI(1, "Send error in write = %d", rc); + cifs_dbg(FYI, "Send error in write = %d\n", rc); } else { *nbytes = le16_to_cpu(pSMBr->CountHigh); *nbytes = (*nbytes) << 16; @@ -1959,7 +1945,7 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) /* this would overflow */ if (nr_pages == 0) { - cERROR(1, "%s: called with nr_pages == 0!", __func__); + cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__); return NULL; } @@ -2075,7 +2061,8 @@ cifs_async_writev(struct cifs_writedata *wdata) rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; - cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes\n", + wdata->offset, wdata->bytes); smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF); smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16); @@ -2123,7 +2110,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes = 0; - cFYI(1, "write2 at %lld %d bytes", (long long)offset, count); + cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count); if (tcon->ses->capabilities & CAP_LARGE_FILES) { wct = 14; @@ -2182,7 +2169,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { - cFYI(1, "Send error Write2 = %d", rc); + cifs_dbg(FYI, "Send error Write2 = %d\n", rc); } else if (resp_buf_type == 0) { /* presumably this can not happen, but best to be safe */ rc = -EIO; @@ -2223,7 +2210,8 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; __u16 count; - cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n", + num_lock, num_unlock); rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); if (rc) @@ -2249,7 +2237,7 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); if (rc) - cFYI(1, "Send error in cifs_lockv = %d", rc); + cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc); return rc; } @@ -2268,7 +2256,8 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, int flags = 0; __u16 count; - cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock); + cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n", + (int)waitFlag, numLock); rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); if (rc) @@ -2317,7 +2306,7 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, } cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); if (rc) - cFYI(1, "Send error in Lock = %d", rc); + cifs_dbg(FYI, "Send error in Lock = %d\n", rc); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -2341,7 +2330,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, param_offset, offset, byte_count, count; struct kvec iov[1]; - cFYI(1, "Posix Lock"); + cifs_dbg(FYI, "Posix Lock\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); @@ -2408,7 +2397,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, } if (rc) { - cFYI(1, "Send error in Posix Lock = %d", rc); + cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc); } else if (pLockData) { /* lock structure can be returned on get */ __u16 data_offset; @@ -2465,7 +2454,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) { int rc = 0; CLOSE_REQ *pSMB = NULL; - cFYI(1, "In CIFSSMBClose"); + cifs_dbg(FYI, "In CIFSSMBClose\n"); /* do not retry on dead session on close */ rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); @@ -2482,7 +2471,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) if (rc) { if (rc != -EINTR) { /* EINTR is expected when user ctl-c to kill app */ - cERROR(1, "Send error in Close = %d", rc); + cifs_dbg(VFS, "Send error in Close = %d\n", rc); } } @@ -2498,7 +2487,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) { int rc = 0; FLUSH_REQ *pSMB = NULL; - cFYI(1, "In CIFSSMBFlush"); + cifs_dbg(FYI, "In CIFSSMBFlush\n"); rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); if (rc) @@ -2509,7 +2498,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes); if (rc) - cERROR(1, "Send error in Flush = %d", rc); + cifs_dbg(VFS, "Send error in Flush = %d\n", rc); return rc; } @@ -2527,7 +2516,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, __u16 count; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSSMBRename"); + cifs_dbg(FYI, "In CIFSSMBRename\n"); renameRetry: rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2574,7 +2563,7 @@ renameRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_renames); if (rc) - cFYI(1, "Send error in rename = %d", rc); + cifs_dbg(FYI, "Send error in rename = %d\n", rc); cifs_buf_release(pSMB); @@ -2598,7 +2587,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, int len_of_str; __u16 params, param_offset, offset, count, byte_count; - cFYI(1, "Rename to File by handle"); + cifs_dbg(FYI, "Rename to File by handle\n"); rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2655,7 +2644,8 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames); if (rc) - cFYI(1, "Send error in Rename (by file handle) = %d", rc); + cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n", + rc); cifs_buf_release(pSMB); @@ -2677,7 +2667,7 @@ CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon, int name_len, name_len2; __u16 count; - cFYI(1, "In CIFSSMBCopy"); + cifs_dbg(FYI, "In CIFSSMBCopy\n"); copyRetry: rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2722,8 +2712,8 @@ copyRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in copy = %d with %d files copied", - rc, le16_to_cpu(pSMBr->CopyCount)); + cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n", + rc, le16_to_cpu(pSMBr->CopyCount)); } cifs_buf_release(pSMB); @@ -2747,7 +2737,7 @@ CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In Symlink Unix style"); + cifs_dbg(FYI, "In Symlink Unix style\n"); createSymLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2812,7 +2802,8 @@ createSymLinkRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks); if (rc) - cFYI(1, "Send error in SetPathInfo create symlink = %d", rc); + cifs_dbg(FYI, "Send error in SetPathInfo create symlink = %d\n", + rc); cifs_buf_release(pSMB); @@ -2836,7 +2827,7 @@ CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In Create Hard link Unix style"); + cifs_dbg(FYI, "In Create Hard link Unix style\n"); createHardLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2898,7 +2889,8 @@ createHardLinkRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks); if (rc) - cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc); + cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n", + rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -2920,7 +2912,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 count; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSCreateHardLink"); + cifs_dbg(FYI, "In CIFSCreateHardLink\n"); winCreateHardLinkRetry: rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, @@ -2972,7 +2964,7 @@ winCreateHardLinkRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks); if (rc) - cFYI(1, "Send error in hard link (NT rename) = %d", rc); + cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -2995,7 +2987,7 @@ CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count; char *data_start; - cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName); + cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName); querySymLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -3042,7 +3034,7 @@ querySymLinkRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QuerySymLinkInfo = %d", rc); + cifs_dbg(FYI, "Send error in QuerySymLinkInfo = %d\n", rc); } else { /* decode response */ @@ -3097,7 +3089,8 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, struct smb_com_transaction_ioctl_req *pSMB; struct smb_com_transaction_ioctl_rsp *pSMBr; - cFYI(1, "In Windows reparse style QueryLink for path %s", searchName); + cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n", + searchName); rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -3125,7 +3118,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc); + cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); } else { /* decode response */ __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); __u32 data_count = le32_to_cpu(pSMBr->DataCount); @@ -3149,7 +3142,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, if ((reparse_buf->LinkNamesBuf + reparse_buf->TargetNameOffset + reparse_buf->TargetNameLen) > end_of_smb) { - cFYI(1, "reparse buf beyond SMB"); + cifs_dbg(FYI, "reparse buf beyond SMB\n"); rc = -EIO; goto qreparse_out; } @@ -3170,12 +3163,11 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, } } else { rc = -EIO; - cFYI(1, "Invalid return data count on " - "get reparse info ioctl"); + cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); } symlinkinfo[buflen] = 0; /* just in case so the caller does not go off the end of the buffer */ - cFYI(1, "readlink result - %s", symlinkinfo); + cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo); } qreparse_out: @@ -3198,7 +3190,10 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace, ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); - /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */ +/* + cifs_dbg(FYI, "perm %d tag %d id %d\n", + ace->e_perm, ace->e_tag, ace->e_id); +*/ return; } @@ -3224,8 +3219,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, size += sizeof(struct cifs_posix_ace) * count; /* check if we would go beyond end of SMB */ if (size_of_data_area < size) { - cFYI(1, "bad CIFS POSIX ACL size %d vs. %d", - size_of_data_area, size); + cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n", + size_of_data_area, size); return -EINVAL; } } else if (acl_type & ACL_TYPE_DEFAULT) { @@ -3272,7 +3267,10 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, cifs_ace->cifs_uid = cpu_to_le64(-1); } else cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); - /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/ +/* + cifs_dbg(FYI, "perm %d tag %d id %d\n", + ace->e_perm, ace->e_tag, ace->e_id); +*/ return rc; } @@ -3290,12 +3288,11 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return 0; count = posix_acl_xattr_count((size_t)buflen); - cFYI(1, "setting acl with %d entries from buf of length %d and " - "version of %d", - count, buflen, le32_to_cpu(local_acl->a_version)); + cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", + count, buflen, le32_to_cpu(local_acl->a_version)); if (le32_to_cpu(local_acl->a_version) != 2) { - cFYI(1, "unknown POSIX ACL version %d", - le32_to_cpu(local_acl->a_version)); + cifs_dbg(FYI, "unknown POSIX ACL version %d\n", + le32_to_cpu(local_acl->a_version)); return 0; } cifs_acl->version = cpu_to_le16(1); @@ -3304,7 +3301,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, else if (acl_type == ACL_TYPE_DEFAULT) cifs_acl->default_entry_count = cpu_to_le16(count); else { - cFYI(1, "unknown ACL type %d", acl_type); + cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; } for (i = 0; i < count; i++) { @@ -3337,7 +3334,7 @@ CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, int name_len; __u16 params, byte_count; - cFYI(1, "In GetPosixACL (Unix) for path %s", searchName); + cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); queryAclRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -3390,7 +3387,7 @@ queryAclRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); if (rc) { - cFYI(1, "Send error in Query POSIX ACL = %d", rc); + cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); } else { /* decode response */ @@ -3427,7 +3424,7 @@ CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count, data_count, param_offset, offset; - cFYI(1, "In SetPosixACL (Unix) for path %s", fileName); + cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); setAclRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -3482,7 +3479,7 @@ setAclRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "Set POSIX ACL returned %d", rc); + cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); setACLerrorExit: cifs_buf_release(pSMB); @@ -3502,7 +3499,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; __u16 params, byte_count; - cFYI(1, "In GetExtAttr"); + cifs_dbg(FYI, "In GetExtAttr\n"); if (tcon == NULL) return -ENODEV; @@ -3541,7 +3538,7 @@ GetExtAttrRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "error %d in GetExtAttr", rc); + cifs_dbg(FYI, "error %d in GetExtAttr\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -3556,7 +3553,7 @@ GetExtAttrRetry: struct file_chattr_info *pfinfo; /* BB Do we need a cast or hash here ? */ if (count != 16) { - cFYI(1, "Illegal size ret in GetExtAttr"); + cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n"); rc = -EIO; goto GetExtAttrOut; } @@ -3644,21 +3641,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, /* should we also check that parm and data areas do not overlap? */ if (*ppparm > end_of_smb) { - cFYI(1, "parms start after end of smb"); + cifs_dbg(FYI, "parms start after end of smb\n"); return -EINVAL; } else if (parm_count + *ppparm > end_of_smb) { - cFYI(1, "parm end after end of smb"); + cifs_dbg(FYI, "parm end after end of smb\n"); return -EINVAL; } else if (*ppdata > end_of_smb) { - cFYI(1, "data starts after end of smb"); + cifs_dbg(FYI, "data starts after end of smb\n"); return -EINVAL; } else if (data_count + *ppdata > end_of_smb) { - cFYI(1, "data %p + count %d (%p) past smb end %p start %p", - *ppdata, data_count, (data_count + *ppdata), - end_of_smb, pSMBr); + cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); return -EINVAL; } else if (parm_count + data_count > bcc) { - cFYI(1, "parm count and data count larger than SMB"); + cifs_dbg(FYI, "parm count and data count larger than SMB\n"); return -EINVAL; } *pdatalen = data_count; @@ -3676,7 +3673,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, QUERY_SEC_DESC_REQ *pSMB; struct kvec iov[1]; - cFYI(1, "GetCifsACL"); + cifs_dbg(FYI, "GetCifsACL\n"); *pbuflen = 0; *acl_inf = NULL; @@ -3701,7 +3698,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); if (rc) { - cFYI(1, "Send error in QuerySecDesc = %d", rc); + cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc); } else { /* decode response */ __le32 *parm; __u32 parm_len; @@ -3716,7 +3713,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, goto qsec_out; pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; - cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf); + cifs_dbg(FYI, "smb %p parm %p data %p\n", + pSMBr, parm, *acl_inf); if (le32_to_cpu(pSMBr->ParameterCount) != 4) { rc = -EIO; /* bad smb */ @@ -3728,8 +3726,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, acl_len = le32_to_cpu(*parm); if (acl_len != *pbuflen) { - cERROR(1, "acl length %d does not match %d", - acl_len, *pbuflen); + cifs_dbg(VFS, "acl length %d does not match %d\n", + acl_len, *pbuflen); if (*pbuflen > acl_len) *pbuflen = acl_len; } @@ -3738,16 +3736,15 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, header followed by the smallest SID */ if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || (*pbuflen >= 64 * 1024)) { - cERROR(1, "bad acl length %d", *pbuflen); + cifs_dbg(VFS, "bad acl length %d\n", *pbuflen); rc = -EINVAL; *pbuflen = 0; } else { - *acl_inf = kmalloc(*pbuflen, GFP_KERNEL); + *acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL); if (*acl_inf == NULL) { *pbuflen = 0; rc = -ENOMEM; } - memcpy(*acl_inf, pdata, *pbuflen); } } qsec_out: @@ -3809,9 +3806,10 @@ setCifsAclRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); - cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc); + cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n", + bytes_returned, rc); if (rc) - cFYI(1, "Set CIFS ACL returned %d", rc); + cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -3835,7 +3833,7 @@ SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len; - cFYI(1, "In SMBQPath path %s", search_name); + cifs_dbg(FYI, "In SMBQPath path %s\n", search_name); QInfRetry: rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -3862,7 +3860,7 @@ QInfRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QueryInfo = %d", rc); + cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc); } else if (data) { struct timespec ts; __u32 time = le32_to_cpu(pSMBr->last_write_time); @@ -3932,11 +3930,12 @@ QFileInfoRetry: pSMB->Pad = 0; pSMB->Fid = netfid; inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -3973,7 +3972,7 @@ CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, int name_len; __u16 params, byte_count; - /* cFYI(1, "In QPathInfo path %s", search_name); */ + /* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */ QPathInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4023,7 +4022,7 @@ QPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4100,18 +4099,17 @@ UnixQFileInfoRetry: pSMB->Pad = 0; pSMB->Fid = netfid; inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { - cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. " - "Unix Extensions can be disabled on mount " - "by specifying the nosfu mount option."); + cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n"); rc = -EIO; /* bad smb */ } else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4143,7 +4141,7 @@ CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, int name_len; __u16 params, byte_count; - cFYI(1, "In QPathInfo (Unix) the path %s", searchName); + cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName); UnixQPathInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4190,14 +4188,12 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { - cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. " - "Unix Extensions can be disabled on mount " - "by specifying the nosfu mount option."); + cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n"); rc = -EIO; /* bad smb */ } else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4231,7 +4227,7 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count; struct nls_table *nls_codepage; - cFYI(1, "In FindFirst for %s", searchName); + cifs_dbg(FYI, "In FindFirst for %s\n", searchName); findFirstRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -4314,7 +4310,7 @@ findFirstRetry: if (rc) {/* BB add logic to retry regular search if Unix search rejected unexpectedly by server */ /* BB Add code to handle unsupported level rc */ - cFYI(1, "Error in FindFirst = %d", rc); + cifs_dbg(FYI, "Error in FindFirst = %d\n", rc); cifs_buf_release(pSMB); @@ -4352,7 +4348,7 @@ findFirstRetry: psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); if (CIFSMaxBufSize < lnoff) { - cERROR(1, "ignoring corrupt resume name"); + cifs_dbg(VFS, "ignoring corrupt resume name\n"); psrch_inf->last_entry = NULL; return rc; } @@ -4383,7 +4379,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, unsigned int name_len; __u16 params, byte_count; - cFYI(1, "In FindNext"); + cifs_dbg(FYI, "In FindNext\n"); if (psrch_inf->endOfSearch) return -ENOENT; @@ -4444,7 +4440,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, cifs_buf_release(pSMB); rc = 0; /* search probably was closed at end of search*/ } else - cFYI(1, "FindNext returned = %d", rc); + cifs_dbg(FYI, "FindNext returned = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4479,15 +4475,15 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); if (CIFSMaxBufSize < lnoff) { - cERROR(1, "ignoring corrupt resume name"); + cifs_dbg(VFS, "ignoring corrupt resume name\n"); psrch_inf->last_entry = NULL; return rc; } else psrch_inf->last_entry = psrch_inf->srch_entries_start + lnoff; -/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d", - psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ +/* cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n", + psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ /* BB fixme add unlock here */ } @@ -4512,7 +4508,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; FINDCLOSE_REQ *pSMB = NULL; - cFYI(1, "In CIFSSMBFindClose"); + cifs_dbg(FYI, "In CIFSSMBFindClose\n"); rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); /* no sense returning error if session restarted @@ -4526,7 +4522,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ByteCount = 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cERROR(1, "Send error in FindClose = %d", rc); + cifs_dbg(VFS, "Send error in FindClose = %d\n", rc); cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose); @@ -4548,7 +4544,7 @@ CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, int name_len, bytes_returned; __u16 params, byte_count; - cFYI(1, "In GetSrvInodeNum for %s", search_name); + cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name); if (tcon == NULL) return -ENODEV; @@ -4599,7 +4595,7 @@ GetInodeNumberRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "error %d in QueryInternalInfo", rc); + cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4614,7 +4610,7 @@ GetInodeNumberRetry: struct file_internal_info *pfinfo; /* BB Do we need a cast or hash here ? */ if (count < 8) { - cFYI(1, "Illegal size ret in QryIntrnlInf"); + cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n"); rc = -EIO; goto GetInodeNumOut; } @@ -4655,16 +4651,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); if (*num_of_nodes < 1) { - cERROR(1, "num_referrals: must be at least > 0," - "but we get num_referrals = %d", *num_of_nodes); + cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n", + *num_of_nodes); rc = -EINVAL; goto parse_DFS_referrals_exit; } ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); if (ref->VersionNumber != cpu_to_le16(3)) { - cERROR(1, "Referrals of V%d version are not supported," - "should be V3", le16_to_cpu(ref->VersionNumber)); + cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", + le16_to_cpu(ref->VersionNumber)); rc = -EINVAL; goto parse_DFS_referrals_exit; } @@ -4673,14 +4669,12 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, data_end = (char *)(&(pSMBr->PathConsumed)) + le16_to_cpu(pSMBr->t2.DataCount); - cFYI(1, "num_referrals: %d dfs flags: 0x%x ...", - *num_of_nodes, - le32_to_cpu(pSMBr->DFSFlags)); + cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n", + *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags)); - *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * - *num_of_nodes, GFP_KERNEL); + *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param), + GFP_KERNEL); if (*target_nodes == NULL) { - cERROR(1, "Failed to allocate buffer for target_nodes"); rc = -ENOMEM; goto parse_DFS_referrals_exit; } @@ -4759,7 +4753,7 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, *num_of_nodes = 0; *target_nodes = NULL; - cFYI(1, "In GetDFSRefer the path %s", search_name); + cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name); if (ses == NULL) return -ENODEV; getDFSRetry: @@ -4792,11 +4786,8 @@ getDFSRetry: strncpy(pSMB->RequestFileName, search_name, name_len); } - if (ses->server) { - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - } + if (ses->server && ses->server->sign) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; @@ -4827,7 +4818,7 @@ getDFSRetry: rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in GetDFSRefer = %d", rc); + cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc); goto GetDFSRefExit; } rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4838,9 +4829,8 @@ getDFSRetry: goto GetDFSRefExit; } - cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d", - get_bcc(&pSMBr->hdr), - le16_to_cpu(pSMBr->t2.DataOffset)); + cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d Offset %d\n", + get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset)); /* parse returned result into more usable form */ rc = parse_DFS_referrals(pSMBr, num_of_nodes, @@ -4869,7 +4859,7 @@ SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "OldQFSInfo"); + cifs_dbg(FYI, "OldQFSInfo\n"); oldQFSInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4902,7 +4892,7 @@ oldQFSInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4910,7 +4900,7 @@ oldQFSInfoRetry: rc = -EIO; /* bad smb */ else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - cFYI(1, "qfsinf resp BCC: %d Offset %d", + cifs_dbg(FYI, "qfsinf resp BCC: %d Offset %d\n", get_bcc(&pSMBr->hdr), data_offset); response_data = (FILE_SYSTEM_ALLOC_INFO *) @@ -4923,10 +4913,10 @@ oldQFSInfoRetry: le32_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = le32_to_cpu(response_data->FreeAllocationUnits); - cFYI(1, "Blocks: %lld Free: %lld Block size %ld", - (unsigned long long)FSData->f_blocks, - (unsigned long long)FSData->f_bfree, - FSData->f_bsize); + cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); } } cifs_buf_release(pSMB); @@ -4949,7 +4939,7 @@ CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSInfo"); + cifs_dbg(FYI, "In QFSInfo\n"); QFSInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4982,7 +4972,7 @@ QFSInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5003,10 +4993,10 @@ QFSInfoRetry: le64_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = le64_to_cpu(response_data->FreeAllocationUnits); - cFYI(1, "Blocks: %lld Free: %lld Block size %ld", - (unsigned long long)FSData->f_blocks, - (unsigned long long)FSData->f_bfree, - FSData->f_bsize); + cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); } } cifs_buf_release(pSMB); @@ -5028,7 +5018,7 @@ CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon) int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSAttributeInfo"); + cifs_dbg(FYI, "In QFSAttributeInfo\n"); QFSAttributeRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5062,7 +5052,7 @@ QFSAttributeRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cERROR(1, "Send error in QFSAttributeInfo = %d", rc); + cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5098,7 +5088,7 @@ CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon) int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSDeviceInfo"); + cifs_dbg(FYI, "In QFSDeviceInfo\n"); QFSDeviceRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5133,7 +5123,7 @@ QFSDeviceRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSDeviceInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5169,7 +5159,7 @@ CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon) int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSUnixInfo"); + cifs_dbg(FYI, "In QFSUnixInfo\n"); QFSUnixRetry: rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5203,7 +5193,7 @@ QFSUnixRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cERROR(1, "Send error in QFSUnixInfo = %d", rc); + cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5238,7 +5228,7 @@ CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap) int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In SETFSUnixInfo"); + cifs_dbg(FYI, "In SETFSUnixInfo\n"); SETFSUnixRetry: /* BB switch to small buf init to save memory */ rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, @@ -5286,7 +5276,7 @@ SETFSUnixRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cERROR(1, "Send error in SETFSUnixInfo = %d", rc); + cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc) @@ -5314,7 +5304,7 @@ CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSPosixInfo"); + cifs_dbg(FYI, "In QFSPosixInfo\n"); QFSPosixRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5348,7 +5338,7 @@ QFSPosixRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSUnixInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5410,7 +5400,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count, data_count, param_offset, offset; - cFYI(1, "In SetEOF"); + cifs_dbg(FYI, "In SetEOF\n"); SetEOFRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5476,7 +5466,7 @@ SetEOFRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (file size) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc); cifs_buf_release(pSMB); @@ -5495,8 +5485,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "SetFileSize (via SetFileInfo) %lld", - (long long)size); + cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n", + (long long)size); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5553,7 +5543,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) { - cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc); + cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n", + rc); } /* Note: On -EAGAIN error only caller can retry on handle based calls @@ -5577,7 +5568,7 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "Set Times (via SetFileInfo)"); + cifs_dbg(FYI, "Set Times (via SetFileInfo)\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5623,7 +5614,8 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", + rc); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -5640,7 +5632,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "Set File Disposition (via SetFileInfo)"); + cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5682,7 +5674,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, *data_offset = delete_file ? 1 : 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cFYI(1, "Send error in SetFileDisposition = %d", rc); + cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc); return rc; } @@ -5700,7 +5692,7 @@ CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, char *data_offset; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "In SetTimes"); + cifs_dbg(FYI, "In SetTimes\n"); SetTimesRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -5756,7 +5748,7 @@ SetTimesRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (times) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc); cifs_buf_release(pSMB); @@ -5781,7 +5773,7 @@ CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName, int bytes_returned; int name_len; - cFYI(1, "In SetAttrLegacy"); + cifs_dbg(FYI, "In SetAttrLegacy\n"); SetAttrLgcyRetry: rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, @@ -5807,7 +5799,7 @@ SetAttrLgcyRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "Error in LegacySetAttr = %d", rc); + cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc); cifs_buf_release(pSMB); @@ -5875,7 +5867,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; u16 params, param_offset, offset, byte_count, count; - cFYI(1, "Set Unix Info (via SetFileInfo)"); + cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5921,7 +5913,8 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", + rc); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -5943,7 +5936,7 @@ CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, FILE_UNIX_BASIC_INFO *data_offset; __u16 params, param_offset, offset, count, byte_count; - cFYI(1, "In SetUID/GID/Mode"); + cifs_dbg(FYI, "In SetUID/GID/Mode\n"); setPermsRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5999,7 +5992,7 @@ setPermsRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (perms) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -6036,7 +6029,7 @@ CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count, data_offset; unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; - cFYI(1, "In Query All EAs path %s", searchName); + cifs_dbg(FYI, "In Query All EAs path %s\n", searchName); QAllEAsRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -6083,7 +6076,7 @@ QAllEAsRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QueryAllEAs = %d", rc); + cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc); goto QAllEAsOut; } @@ -6111,16 +6104,16 @@ QAllEAsRetry: (((char *) &pSMBr->hdr.Protocol) + data_offset); list_len = le32_to_cpu(ea_response_data->list_len); - cFYI(1, "ea length %d", list_len); + cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { - cFYI(1, "empty EA list returned from server"); + cifs_dbg(FYI, "empty EA list returned from server\n"); goto QAllEAsOut; } /* make sure list_len doesn't go past end of SMB */ end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr); if ((char *)ea_response_data + list_len > end_of_smb) { - cFYI(1, "EA list appears to go beyond SMB"); + cifs_dbg(FYI, "EA list appears to go beyond SMB\n"); rc = -EIO; goto QAllEAsOut; } @@ -6137,7 +6130,7 @@ QAllEAsRetry: temp_ptr += 4; /* make sure we can read name_len and value_len */ if (list_len < 0) { - cFYI(1, "EA entry goes beyond length of list"); + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); rc = -EIO; goto QAllEAsOut; } @@ -6146,7 +6139,7 @@ QAllEAsRetry: value_len = le16_to_cpu(temp_fea->value_len); list_len -= name_len + 1 + value_len; if (list_len < 0) { - cFYI(1, "EA entry goes beyond length of list"); + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); rc = -EIO; goto QAllEAsOut; } @@ -6214,7 +6207,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, byte_count, offset, count; - cFYI(1, "In SetEA"); + cifs_dbg(FYI, "In SetEA\n"); SetEARetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -6296,7 +6289,7 @@ SetEARetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (EA) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc); cifs_buf_release(pSMB); @@ -6339,7 +6332,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, struct dir_notify_req *dnotify_req; int bytes_returned; - cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid); + cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid); rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -6368,7 +6361,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_ASYNC_OP); if (rc) { - cFYI(1, "Error in Notify = %d", rc); + cifs_dbg(FYI, "Error in Notify = %d\n", rc); } else { /* Add file to outstanding requests */ /* BB change to kmem cache alloc */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 21b3a291c327..afcb8a1a33b7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -85,7 +85,7 @@ enum { Opt_acl, Opt_noacl, Opt_locallease, Opt_sign, Opt_seal, Opt_noac, Opt_fsc, Opt_mfsymlinks, - Opt_multiuser, Opt_sloppy, + Opt_multiuser, Opt_sloppy, Opt_nosharesock, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -95,9 +95,7 @@ enum { /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, - Opt_unc, Opt_domain, - Opt_srcaddr, Opt_prefixpath, - Opt_iocharset, + Opt_domain, Opt_srcaddr, Opt_iocharset, Opt_netbiosname, Opt_servern, Opt_ver, Opt_vers, Opt_sec, Opt_cache, @@ -167,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_mfsymlinks, "mfsymlinks" }, { Opt_multiuser, "multiuser" }, { Opt_sloppy, "sloppy" }, + { Opt_nosharesock, "nosharesock" }, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -193,14 +192,14 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_blank_ip, "addr=" }, { Opt_ip, "ip=%s" }, { Opt_ip, "addr=%s" }, - { Opt_unc, "unc=%s" }, - { Opt_unc, "target=%s" }, - { Opt_unc, "path=%s" }, + { Opt_ignore, "unc=%s" }, + { Opt_ignore, "target=%s" }, + { Opt_ignore, "path=%s" }, { Opt_domain, "dom=%s" }, { Opt_domain, "domain=%s" }, { Opt_domain, "workgroup=%s" }, { Opt_srcaddr, "srcaddr=%s" }, - { Opt_prefixpath, "prefixpath=%s" }, + { Opt_ignore, "prefixpath=%s" }, { Opt_iocharset, "iocharset=%s" }, { Opt_netbiosname, "netbiosname=%s" }, { Opt_servern, "servern=%s" }, @@ -277,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_20, SMB20_VERSION_STRING}, { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, + { Smb_302, SMB302_VERSION_STRING }, }; static int ip_connect(struct TCP_Server_Info *server); @@ -318,11 +318,12 @@ cifs_reconnect(struct TCP_Server_Info *server) server->max_read = 0; #endif - cFYI(1, "Reconnecting tcp session"); + cifs_dbg(FYI, "Reconnecting tcp session\n"); /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ - cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); + cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n", + __func__); spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); @@ -336,15 +337,14 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); /* do not want to be sending data on a socket we are freeing */ - cFYI(1, "%s: tearing down socket", __func__); + cifs_dbg(FYI, "%s: tearing down socket\n", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { - cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, - server->ssocket->flags); + cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", + server->ssocket->state, server->ssocket->flags); kernel_sock_shutdown(server->ssocket, SHUT_WR); - cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx", - server->ssocket->state, - server->ssocket->flags); + cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n", + server->ssocket->state, server->ssocket->flags); sock_release(server->ssocket); server->ssocket = NULL; } @@ -358,7 +358,7 @@ cifs_reconnect(struct TCP_Server_Info *server) /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); - cFYI(1, "%s: moving mids to private list", __func__); + cifs_dbg(FYI, "%s: moving mids to private list\n", __func__); spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); @@ -368,7 +368,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); - cFYI(1, "%s: issuing mid callbacks", __func__); + cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_safe(tmp, tmp2, &retry_list) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); list_del_init(&mid_entry->qhead); @@ -381,7 +381,7 @@ cifs_reconnect(struct TCP_Server_Info *server) /* we should try only the port we connected to before */ rc = generic_ip_connect(server); if (rc) { - cFYI(1, "reconnect error %d", rc); + cifs_dbg(FYI, "reconnect error %d\n", rc); msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -415,8 +415,8 @@ cifs_echo_request(struct work_struct *work) rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; if (rc) - cFYI(1, "Unable to send echo request to server: %s", - server->hostname); + cifs_dbg(FYI, "Unable to send echo request to server: %s\n", + server->hostname); requeue_echo: queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); @@ -428,7 +428,7 @@ allocate_buffers(struct TCP_Server_Info *server) if (!server->bigbuf) { server->bigbuf = (char *)cifs_buf_get(); if (!server->bigbuf) { - cERROR(1, "No memory for large SMB response"); + cifs_dbg(VFS, "No memory for large SMB response\n"); msleep(3000); /* retry will check if exiting */ return false; @@ -441,7 +441,7 @@ allocate_buffers(struct TCP_Server_Info *server) if (!server->smallbuf) { server->smallbuf = (char *)cifs_small_buf_get(); if (!server->smallbuf) { - cERROR(1, "No memory for SMB response"); + cifs_dbg(VFS, "No memory for SMB response\n"); msleep(1000); /* retry will check if exiting */ return false; @@ -471,9 +471,8 @@ server_unresponsive(struct TCP_Server_Info *server) */ if (server->tcpStatus == CifsGood && time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (2 * SMB_ECHO_INTERVAL) / HZ); + cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", + server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -584,8 +583,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, length = 0; continue; } else if (length <= 0) { - cFYI(1, "Received no data or error: expecting %d " - "got %d", to_read, length); + cifs_dbg(FYI, "Received no data or error: expecting %d\n" + "got %d", to_read, length); cifs_reconnect(server); total_read = -EAGAIN; break; @@ -619,17 +618,17 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) /* Regular SMB response */ return true; case RFC1002_SESSION_KEEP_ALIVE: - cFYI(1, "RFC 1002 session keep alive"); + cifs_dbg(FYI, "RFC 1002 session keep alive\n"); break; case RFC1002_POSITIVE_SESSION_RESPONSE: - cFYI(1, "RFC 1002 positive session response"); + cifs_dbg(FYI, "RFC 1002 positive session response\n"); break; case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on * SMB negprot response. */ - cFYI(1, "RFC 1002 negative session response"); + cifs_dbg(FYI, "RFC 1002 negative session response\n"); /* give server a second to clean up */ msleep(1000); /* @@ -643,7 +642,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) wake_up(&server->response_q); break; default: - cERROR(1, "RFC 1002 unknown response type 0x%x", type); + cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); cifs_reconnect(server); } @@ -729,7 +728,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Clearing mid 0x%llx", mid_entry->mid); + cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); } @@ -738,7 +737,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) /* now walk dispose list and issue callbacks */ list_for_each_safe(tmp, tmp2, &dispose_list) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Callback mid 0x%llx", mid_entry->mid); + cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); } @@ -755,7 +754,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) * least 45 seconds before giving up on a request getting a * response and going ahead and killing cifsd. */ - cFYI(1, "Wait for exit from demultiplex thread"); + cifs_dbg(FYI, "Wait for exit from demultiplex thread\n"); msleep(46000); /* * If threads still have not exited they are probably never @@ -782,8 +781,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* make sure this will fit in a large buffer */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) { - cERROR(1, "SMB response too long (%u bytes)", - pdu_length); + cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); return -EAGAIN; @@ -841,7 +839,7 @@ cifs_demultiplex_thread(void *p) struct mid_q_entry *mid_entry; current->flags |= PF_MEMALLOC; - cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); + cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); length = atomic_inc_return(&tcpSesAllocCount); if (length > 1) @@ -871,14 +869,14 @@ cifs_demultiplex_thread(void *p) */ pdu_length = get_rfc1002_length(buf); - cFYI(1, "RFC1002 header 0x%x", pdu_length); + cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; /* make sure we have enough to get to the MID */ if (pdu_length < HEADER_SIZE(server) - 1 - 4) { - cERROR(1, "SMB response too short (%u bytes)", - pdu_length); + cifs_dbg(VFS, "SMB response too short (%u bytes)\n", + pdu_length); cifs_reconnect(server); wake_up(&server->response_q); continue; @@ -910,8 +908,8 @@ cifs_demultiplex_thread(void *p) mid_entry->callback(mid_entry); } else if (!server->ops->is_oplock_break || !server->ops->is_oplock_break(buf, server)) { - cERROR(1, "No task to wake, unknown frame received! " - "NumMids %d", atomic_read(&midCount)); + cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n", + atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, HEADER_SIZE(server)); #ifdef CONFIG_CIFS_DEBUG2 @@ -1028,46 +1026,51 @@ static int cifs_parse_security_flavors(char *value, substring_t args[MAX_OPT_ARGS]; + /* + * With mount options, the last one should win. Reset any existing + * settings back to default. + */ + vol->sectype = Unspecified; + vol->sign = false; + switch (match_token(value, cifs_secflavor_tokens, args)) { - case Opt_sec_krb5: - vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN; - break; - case Opt_sec_krb5i: - vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN; - break; case Opt_sec_krb5p: - /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */ - cERROR(1, "Krb5 cifs privacy not supported"); - break; - case Opt_sec_ntlmssp: - vol->secFlg |= CIFSSEC_MAY_NTLMSSP; + cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + return 1; + case Opt_sec_krb5i: + vol->sign = true; + /* Fallthrough */ + case Opt_sec_krb5: + vol->sectype = Kerberos; break; case Opt_sec_ntlmsspi: - vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN; - break; - case Opt_ntlm: - /* ntlm is default so can be turned off too */ - vol->secFlg |= CIFSSEC_MAY_NTLM; + vol->sign = true; + /* Fallthrough */ + case Opt_sec_ntlmssp: + vol->sectype = RawNTLMSSP; break; case Opt_sec_ntlmi: - vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN; - break; - case Opt_sec_ntlmv2: - vol->secFlg |= CIFSSEC_MAY_NTLMV2; + vol->sign = true; + /* Fallthrough */ + case Opt_ntlm: + vol->sectype = NTLM; break; case Opt_sec_ntlmv2i: - vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; + vol->sign = true; + /* Fallthrough */ + case Opt_sec_ntlmv2: + vol->sectype = NTLMv2; break; #ifdef CONFIG_CIFS_WEAK_PW_HASH case Opt_sec_lanman: - vol->secFlg |= CIFSSEC_MAY_LANMAN; + vol->sectype = LANMAN; break; #endif case Opt_sec_none: vol->nullauth = 1; break; default: - cERROR(1, "bad security option: %s", value); + cifs_dbg(VFS, "bad security option: %s\n", value); return 1; } @@ -1093,7 +1096,7 @@ cifs_parse_cache_flavor(char *value, struct smb_vol *vol) vol->strict_io = false; break; default: - cERROR(1, "bad cache= option: %s", value); + cifs_dbg(VFS, "bad cache= option: %s\n", value); return 1; } return 0; @@ -1122,9 +1125,13 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb30_operations; vol->vals = &smb30_values; break; + case Smb_302: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb302_values; + break; #endif default: - cERROR(1, "Unknown vers= option specified: %s", value); + cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); return 1; } return 0; @@ -1255,20 +1262,24 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, separator[0] = options[4]; options += 5; } else { - cFYI(1, "Null separator not allowed"); + cifs_dbg(FYI, "Null separator not allowed\n"); } } vol->backupuid_specified = false; /* no backup intent for a user */ vol->backupgid_specified = false; /* no backup intent for a group */ - /* - * For now, we ignore -EINVAL errors under the assumption that the - * unc= and prefixpath= options will be usable. - */ - if (cifs_parse_devname(devname, vol) == -ENOMEM) { - printk(KERN_ERR "CIFS: Unable to allocate memory to parse " - "device string.\n"); - goto out_nomem; + switch (cifs_parse_devname(devname, vol)) { + case 0: + break; + case -ENOMEM: + cifs_dbg(VFS, "Unable to allocate memory for devname.\n"); + goto cifs_parse_mount_err; + case -EINVAL: + cifs_dbg(VFS, "Malformed UNC in devname.\n"); + goto cifs_parse_mount_err; + default: + cifs_dbg(VFS, "Unknown error parsing devname.\n"); + goto cifs_parse_mount_err; } while ((data = strsep(&options, separator)) != NULL) { @@ -1423,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->local_lease = 1; break; case Opt_sign: - vol->secFlg |= CIFSSEC_MUST_SIGN; + vol->sign = true; break; case Opt_seal: /* we do not do the following in secFlags because seal @@ -1440,8 +1451,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE - cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " - "kernel config option set"); + cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif vol->fsc = true; @@ -1455,59 +1465,62 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_sloppy: sloppy = true; break; + case Opt_nosharesock: + vol->nosharesock = true; + break; /* Numeric Values */ case Opt_backupuid: if (get_option_uid(args, &vol->backupuid)) { - cERROR(1, "%s: Invalid backupuid value", - __func__); + cifs_dbg(VFS, "%s: Invalid backupuid value\n", + __func__); goto cifs_parse_mount_err; } vol->backupuid_specified = true; break; case Opt_backupgid: if (get_option_gid(args, &vol->backupgid)) { - cERROR(1, "%s: Invalid backupgid value", - __func__); + cifs_dbg(VFS, "%s: Invalid backupgid value\n", + __func__); goto cifs_parse_mount_err; } vol->backupgid_specified = true; break; case Opt_uid: if (get_option_uid(args, &vol->linux_uid)) { - cERROR(1, "%s: Invalid uid value", - __func__); + cifs_dbg(VFS, "%s: Invalid uid value\n", + __func__); goto cifs_parse_mount_err; } uid_specified = true; break; case Opt_cruid: if (get_option_uid(args, &vol->cred_uid)) { - cERROR(1, "%s: Invalid cruid value", - __func__); + cifs_dbg(VFS, "%s: Invalid cruid value\n", + __func__); goto cifs_parse_mount_err; } break; case Opt_gid: if (get_option_gid(args, &vol->linux_gid)) { - cERROR(1, "%s: Invalid gid value", - __func__); + cifs_dbg(VFS, "%s: Invalid gid value\n", + __func__); goto cifs_parse_mount_err; } gid_specified = true; break; case Opt_file_mode: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid file_mode value", - __func__); + cifs_dbg(VFS, "%s: Invalid file_mode value\n", + __func__); goto cifs_parse_mount_err; } vol->file_mode = option; break; case Opt_dirmode: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid dir_mode value", - __func__); + cifs_dbg(VFS, "%s: Invalid dir_mode value\n", + __func__); goto cifs_parse_mount_err; } vol->dir_mode = option; @@ -1515,37 +1528,37 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_port: if (get_option_ul(args, &option) || option > USHRT_MAX) { - cERROR(1, "%s: Invalid port value", __func__); + cifs_dbg(VFS, "%s: Invalid port value\n", + __func__); goto cifs_parse_mount_err; } port = (unsigned short)option; break; case Opt_rsize: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid rsize value", - __func__); + cifs_dbg(VFS, "%s: Invalid rsize value\n", + __func__); goto cifs_parse_mount_err; } vol->rsize = option; break; case Opt_wsize: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid wsize value", - __func__); + cifs_dbg(VFS, "%s: Invalid wsize value\n", + __func__); goto cifs_parse_mount_err; } vol->wsize = option; break; case Opt_actimeo: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid actimeo value", - __func__); + cifs_dbg(VFS, "%s: Invalid actimeo value\n", + __func__); goto cifs_parse_mount_err; } vol->actimeo = HZ * option; if (vol->actimeo > CIFS_MAX_ACTIMEO) { - cERROR(1, "CIFS: attribute cache" - "timeout too large"); + cifs_dbg(VFS, "attribute cache timeout too large\n"); goto cifs_parse_mount_err; } break; @@ -1568,11 +1581,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } vol->username = kstrdup(string, GFP_KERNEL); - if (!vol->username) { - printk(KERN_WARNING "CIFS: no memory " - "for username\n"); + if (!vol->username) goto cifs_parse_mount_err; - } break; case Opt_blank_pass: /* passwords have to be handled differently @@ -1660,30 +1670,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } got_ip = true; break; - case Opt_unc: - string = vol->UNC; - vol->UNC = match_strdup(args); - if (vol->UNC == NULL) - goto out_nomem; - - convert_delimiter(vol->UNC, '\\'); - if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { - printk(KERN_ERR "CIFS: UNC Path does not " - "begin with // or \\\\\n"); - goto cifs_parse_mount_err; - } - - /* Compare old unc= option to new one */ - if (!string || strcmp(string, vol->UNC)) - printk(KERN_WARNING "CIFS: the value of the " - "unc= mount option does not match the " - "device string. Using the unc= option " - "for now. In 3.10, that option will " - "be ignored and the contents of the " - "device string will be used " - "instead. (%s != %s)\n", string, - vol->UNC); - break; case Opt_domain: string = match_strdup(args); if (string == NULL) @@ -1701,7 +1687,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, "for domainname\n"); goto cifs_parse_mount_err; } - cFYI(1, "Domain name set"); + cifs_dbg(FYI, "Domain name set\n"); break; case Opt_srcaddr: string = match_strdup(args); @@ -1716,26 +1702,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; - case Opt_prefixpath: - /* skip over any leading delimiter */ - if (*args[0].from == '/' || *args[0].from == '\\') - args[0].from++; - - string = vol->prepath; - vol->prepath = match_strdup(args); - if (vol->prepath == NULL) - goto out_nomem; - /* Compare old prefixpath= option to new one */ - if (!string || strcmp(string, vol->prepath)) - printk(KERN_WARNING "CIFS: the value of the " - "prefixpath= mount option does not " - "match the device string. Using the " - "prefixpath= option for now. In 3.10, " - "that option will be ignored and the " - "contents of the device string will be " - "used instead.(%s != %s)\n", string, - vol->prepath); - break; case Opt_iocharset: string = match_strdup(args); if (string == NULL) @@ -1759,7 +1725,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* if iocharset not set then load_nls_default * is used by caller */ - cFYI(1, "iocharset set to %s", string); + cifs_dbg(FYI, "iocharset set to %s\n", string); break; case Opt_netbiosname: string = match_strdup(args); @@ -1873,20 +1839,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { - cERROR(1, "Multiuser mounts require kernels with " - "CONFIG_KEYS enabled."); + cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); goto cifs_parse_mount_err; } #endif if (!vol->UNC) { - cERROR(1, "CIFS mount error: No usable UNC path provided in " - "device string or in unc= option!"); + cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); goto cifs_parse_mount_err; } /* make sure UNC has a share name */ if (!strchr(vol->UNC + 3, '\\')) { - cERROR(1, "Malformed UNC. Unable to find share name."); + cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n"); goto cifs_parse_mount_err; } @@ -2027,47 +1991,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr, static bool match_security(struct TCP_Server_Info *server, struct smb_vol *vol) { - unsigned int secFlags; - - if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - secFlags = vol->secFlg; - else - secFlags = global_secflags | vol->secFlg; - - switch (server->secType) { - case LANMAN: - if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) - return false; - break; - case NTLMv2: - if (!(secFlags & CIFSSEC_MAY_NTLMV2)) - return false; - break; - case NTLM: - if (!(secFlags & CIFSSEC_MAY_NTLM)) - return false; - break; - case Kerberos: - if (!(secFlags & CIFSSEC_MAY_KRB5)) - return false; - break; - case RawNTLMSSP: - if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) - return false; - break; - default: - /* shouldn't happen */ + /* + * The select_sectype function should either return the vol->sectype + * that was specified, or "Unspecified" if that sectype was not + * compatible with the given NEGOTIATE request. + */ + if (select_sectype(server, vol->sectype) == Unspecified) return false; - } - /* now check if signing mode is acceptable */ - if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && - (server->sec_mode & SECMODE_SIGN_REQUIRED)) - return false; - else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && - (server->sec_mode & - (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) - return false; + /* + * Now check if signing mode is acceptable. No need to check + * global_secflags at this point since if MUST_SIGN is set then + * the server->sign had better be too. + */ + if (vol->sign && !server->sign) + return false; return true; } @@ -2076,6 +2014,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) { struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; + if (vol->nosharesock) + return 0; + if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; @@ -2107,7 +2048,7 @@ cifs_find_tcp_session(struct smb_vol *vol) ++server->srv_count; spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "Existing tcp session with server found"); + cifs_dbg(FYI, "Existing tcp session with server found\n"); return server; } spin_unlock(&cifs_tcp_ses_lock); @@ -2154,7 +2095,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) struct TCP_Server_Info *tcp_ses = NULL; int rc; - cFYI(1, "UNC: %s", volume_info->UNC); + cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC); /* see if we already have a matching tcp_ses */ tcp_ses = cifs_find_tcp_session(volume_info); @@ -2169,7 +2110,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = cifs_crypto_shash_allocate(tcp_ses); if (rc) { - cERROR(1, "could not setup hash structures rc %d", rc); + cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc); goto out_err; } @@ -2216,7 +2157,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = ip_connect(tcp_ses); if (rc < 0) { - cERROR(1, "Error connecting to socket. Aborting operation"); + cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); goto out_err_crypto_release; } @@ -2229,7 +2170,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses, "cifsd"); if (IS_ERR(tcp_ses->tsk)) { rc = PTR_ERR(tcp_ses->tsk); - cERROR(1, "error %d create cifsd thread", rc); + cifs_dbg(VFS, "error %d create cifsd thread\n", rc); module_put(THIS_MODULE); goto out_err_crypto_release; } @@ -2265,7 +2206,11 @@ out_err: static int match_session(struct cifs_ses *ses, struct smb_vol *vol) { - switch (ses->server->secType) { + if (vol->sectype != Unspecified && + vol->sectype != ses->sectype) + return 0; + + switch (ses->sectype) { case Kerberos: if (!uid_eq(vol->cred_uid, ses->cred_uid)) return 0; @@ -2316,7 +2261,7 @@ cifs_put_smb_ses(struct cifs_ses *ses) unsigned int xid; struct TCP_Server_Info *server = ses->server; - cFYI(1, "%s: ses_count=%d", __func__, ses->ses_count); + cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2368,23 +2313,24 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); break; default: - cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family); + cifs_dbg(FYI, "Bad ss_family (%hu)\n", + server->dstaddr.ss_family); rc = -EINVAL; goto out_err; } - cFYI(1, "%s: desc=%s", __func__, desc); + cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { if (!ses->domainName) { - cFYI(1, "domainName is NULL"); + cifs_dbg(FYI, "domainName is NULL\n"); rc = PTR_ERR(key); goto out_err; } /* didn't work, try to find a domain key */ sprintf(desc, "cifs:d:%s", ses->domainName); - cFYI(1, "%s: desc=%s", __func__, desc); + cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { rc = PTR_ERR(key); @@ -2402,32 +2348,34 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) /* find first : in payload */ payload = (char *)upayload->data; delim = strnchr(payload, upayload->datalen, ':'); - cFYI(1, "payload=%s", payload); + cifs_dbg(FYI, "payload=%s\n", payload); if (!delim) { - cFYI(1, "Unable to find ':' in payload (datalen=%d)", - upayload->datalen); + cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n", + upayload->datalen); rc = -EINVAL; goto out_key_put; } len = delim - payload; if (len > MAX_USERNAME_SIZE || len <= 0) { - cFYI(1, "Bad value from username search (len=%zd)", len); + cifs_dbg(FYI, "Bad value from username search (len=%zd)\n", + len); rc = -EINVAL; goto out_key_put; } vol->username = kstrndup(payload, len, GFP_KERNEL); if (!vol->username) { - cFYI(1, "Unable to allocate %zd bytes for username", len); + cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n", + len); rc = -ENOMEM; goto out_key_put; } - cFYI(1, "%s: username=%s", __func__, vol->username); + cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username); len = key->datalen - (len + 1); if (len > MAX_PASSWORD_SIZE || len <= 0) { - cFYI(1, "Bad len for password search (len=%zd)", len); + cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len); rc = -EINVAL; kfree(vol->username); vol->username = NULL; @@ -2437,7 +2385,8 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) ++delim; vol->password = kstrndup(delim, len, GFP_KERNEL); if (!vol->password) { - cFYI(1, "Unable to allocate %zd bytes for password", len); + cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", + len); rc = -ENOMEM; kfree(vol->username); vol->username = NULL; @@ -2449,7 +2398,7 @@ out_key_put: key_put(key); out_err: kfree(desc); - cFYI(1, "%s: returning %d", __func__, rc); + cifs_dbg(FYI, "%s: returning %d\n", __func__, rc); return rc; } #else /* ! CONFIG_KEYS */ @@ -2474,7 +2423,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses = cifs_find_smb_ses(server, volume_info); if (ses) { - cFYI(1, "Existing smb sess found (status=%d)", ses->status); + cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", + ses->status); mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); @@ -2486,7 +2436,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) return ERR_PTR(rc); } if (ses->need_reconnect) { - cFYI(1, "Session needs reconnect"); + cifs_dbg(FYI, "Session needs reconnect\n"); rc = cifs_setup_session(xid, ses, volume_info->local_nls); if (rc) { @@ -2505,7 +2455,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) return ses; } - cFYI(1, "Existing smb sess not found"); + cifs_dbg(FYI, "Existing smb sess not found\n"); ses = sesInfoAlloc(); if (ses == NULL) goto get_ses_fail; @@ -2537,7 +2487,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; - ses->overrideSecFlg = volume_info->secFlg; + ses->sectype = volume_info->sectype; + ses->sign = volume_info->sign; mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); @@ -2595,7 +2546,7 @@ cifs_put_tcon(struct cifs_tcon *tcon) unsigned int xid; struct cifs_ses *ses = tcon->ses; - cFYI(1, "%s: tc_count=%d", __func__, tcon->tc_count); + cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); spin_lock(&cifs_tcp_ses_lock); if (--tcon->tc_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2623,12 +2574,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon = cifs_find_tcon(ses, volume_info->UNC); if (tcon) { - cFYI(1, "Found match on UNC path"); + cifs_dbg(FYI, "Found match on UNC path\n"); /* existing tcon already has a reference */ cifs_put_smb_ses(ses); if (tcon->seal != volume_info->seal) - cERROR(1, "transport encryption setting " - "conflicts with existing tid"); + cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n"); return tcon; } @@ -2660,13 +2610,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon, volume_info->local_nls); free_xid(xid); - cFYI(1, "Tcon rc = %d", rc); + cifs_dbg(FYI, "Tcon rc = %d\n", rc); if (rc) goto out_fail; if (volume_info->nodfs) { tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; - cFYI(1, "DFS disabled (%d)", tcon->Flags); + cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); } tcon->seal = volume_info->seal; /* @@ -2820,7 +2770,7 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$"); rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL, nls_codepage); - cFYI(1, "Tcon rc = %d ipc_tid = %d", rc, ses->ipc_tid); + cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid); kfree(temp_unc); } if (rc == 0) @@ -2898,13 +2848,11 @@ bind_socket(struct TCP_Server_Info *server) saddr4 = (struct sockaddr_in *)&server->srcaddr; saddr6 = (struct sockaddr_in6 *)&server->srcaddr; if (saddr6->sin6_family == AF_INET6) - cERROR(1, "cifs: " - "Failed to bind to: %pI6c, error: %d", - &saddr6->sin6_addr, rc); + cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n", + &saddr6->sin6_addr, rc); else - cERROR(1, "cifs: " - "Failed to bind to: %pI4, error: %d", - &saddr4->sin_addr.s_addr, rc); + cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n", + &saddr4->sin_addr.s_addr, rc); } } return rc; @@ -3009,13 +2957,13 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, IPPROTO_TCP, &socket, 1); if (rc < 0) { - cERROR(1, "Error %d creating socket", rc); + cifs_dbg(VFS, "Error %d creating socket\n", rc); server->ssocket = NULL; return rc; } /* BB other socket options to set KEEPALIVE, NODELAY? */ - cFYI(1, "Socket created"); + cifs_dbg(FYI, "Socket created\n"); server->ssocket = socket; socket->sk->sk_allocation = GFP_NOFS; if (sfamily == AF_INET6) @@ -3049,16 +2997,17 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, (char *)&val, sizeof(val)); if (rc) - cFYI(1, "set TCP_NODELAY socket option error %d", rc); + cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", + rc); } - cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx", + cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); rc = socket->ops->connect(socket, saddr, slen, 0); if (rc < 0) { - cFYI(1, "Error %d connecting to server", rc); + cifs_dbg(FYI, "Error %d connecting to server\n", rc); sock_release(socket); server->ssocket = NULL; return rc; @@ -3116,19 +3065,19 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (vol_info && vol_info->no_linux_ext) { tcon->fsUnixInfo.Capability = 0; tcon->unix_ext = 0; /* Unix Extensions disabled */ - cFYI(1, "Linux protocol extensions disabled"); + cifs_dbg(FYI, "Linux protocol extensions disabled\n"); return; } else if (vol_info) tcon->unix_ext = 1; /* Unix Extensions supported */ if (tcon->unix_ext == 0) { - cFYI(1, "Unix extensions disabled so not set on reconnect"); + cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n"); return; } if (!CIFSSMBQFSUnixInfo(xid, tcon)) { __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); - cFYI(1, "unix caps which server supports %lld", cap); + cifs_dbg(FYI, "unix caps which server supports %lld\n", cap); /* check for reconnect case in which we do not want to change the mount behavior if we can avoid it */ if (vol_info == NULL) { @@ -3138,22 +3087,22 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, cap &= ~CIFS_UNIX_POSIX_ACL_CAP; if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) - cERROR(1, "POSIXPATH support change"); + cifs_dbg(VFS, "POSIXPATH support change\n"); cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { - cERROR(1, "possible reconnect error"); - cERROR(1, "server disabled POSIX path support"); + cifs_dbg(VFS, "possible reconnect error\n"); + cifs_dbg(VFS, "server disabled POSIX path support\n"); } } if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) - cERROR(1, "per-share encryption not supported yet"); + cifs_dbg(VFS, "per-share encryption not supported yet\n"); cap &= CIFS_UNIX_CAP_MASK; if (vol_info && vol_info->no_psx_acl) cap &= ~CIFS_UNIX_POSIX_ACL_CAP; else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { - cFYI(1, "negotiated posix acl support"); + cifs_dbg(FYI, "negotiated posix acl support\n"); if (cifs_sb) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIXACL; @@ -3162,43 +3111,38 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (vol_info && vol_info->posix_paths == 0) cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { - cFYI(1, "negotiate posix pathnames"); + cifs_dbg(FYI, "negotiate posix pathnames\n"); if (cifs_sb) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; } - cFYI(1, "Negotiate caps 0x%x", (int)cap); + cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) - cFYI(1, "FCNTL cap"); + cifs_dbg(FYI, "FCNTL cap\n"); if (cap & CIFS_UNIX_EXTATTR_CAP) - cFYI(1, "EXTATTR cap"); + cifs_dbg(FYI, "EXTATTR cap\n"); if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) - cFYI(1, "POSIX path cap"); + cifs_dbg(FYI, "POSIX path cap\n"); if (cap & CIFS_UNIX_XATTR_CAP) - cFYI(1, "XATTR cap"); + cifs_dbg(FYI, "XATTR cap\n"); if (cap & CIFS_UNIX_POSIX_ACL_CAP) - cFYI(1, "POSIX ACL cap"); + cifs_dbg(FYI, "POSIX ACL cap\n"); if (cap & CIFS_UNIX_LARGE_READ_CAP) - cFYI(1, "very large read cap"); + cifs_dbg(FYI, "very large read cap\n"); if (cap & CIFS_UNIX_LARGE_WRITE_CAP) - cFYI(1, "very large write cap"); + cifs_dbg(FYI, "very large write cap\n"); if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP) - cFYI(1, "transport encryption cap"); + cifs_dbg(FYI, "transport encryption cap\n"); if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) - cFYI(1, "mandatory transport encryption cap"); + cifs_dbg(FYI, "mandatory transport encryption cap\n"); #endif /* CIFS_DEBUG2 */ if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { if (vol_info == NULL) { - cFYI(1, "resetting capabilities failed"); + cifs_dbg(FYI, "resetting capabilities failed\n"); } else - cERROR(1, "Negotiating Unix capabilities " - "with the server failed. Consider " - "mounting with the Unix Extensions " - "disabled if problems are found " - "by specifying the nounix mount " - "option."); + cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n"); } } @@ -3223,8 +3167,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_gid = pvolume_info->linux_gid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; - cFYI(1, "file mode: 0x%hx dir mode: 0x%hx", - cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n", + cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); cifs_sb->actimeo = pvolume_info->actimeo; cifs_sb->local_nls = pvolume_info->local_nls; @@ -3273,21 +3217,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if (pvolume_info->strict_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; if (pvolume_info->direct_io) { - cFYI(1, "mounting share using direct i/o"); + cifs_dbg(FYI, "mounting share using direct i/o\n"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; } if (pvolume_info->mfsymlinks) { if (pvolume_info->sfu_emul) { - cERROR(1, "mount option mfsymlinks ignored if sfu " - "mount option is used"); + cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n"); } else { cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; } } if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) - cERROR(1, "mount option dynperm ignored if cifsacl " - "mount option supported"); + cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); } static void @@ -3332,14 +3274,14 @@ build_unc_path_to_root(const struct smb_vol *vol, pos = full_path + unc_len; if (pplen) { - *pos++ = CIFS_DIR_SEP(cifs_sb); - strncpy(pos, vol->prepath, pplen); + *pos = CIFS_DIR_SEP(cifs_sb); + strncpy(pos + 1, vol->prepath, pplen); pos += pplen; } *pos = '\0'; /* add trailing null */ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); - cFYI(1, "%s: full_path=%s", __func__, full_path); + cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path); return full_path; } @@ -3410,14 +3352,14 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, return -EINVAL; if (volume_info->nullauth) { - cFYI(1, "Anonymous login"); + cifs_dbg(FYI, "Anonymous login\n"); kfree(volume_info->username); volume_info->username = NULL; } else if (volume_info->username) { /* BB fixme parse for domain name here */ - cFYI(1, "Username: %s", volume_info->username); + cifs_dbg(FYI, "Username: %s\n", volume_info->username); } else { - cifserror("No username specified"); + cifs_dbg(VFS, "No username specified\n"); /* In userspace mount helper we can get user name from alternate locations such as env variables and files on disk */ return -EINVAL; @@ -3430,7 +3372,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, } else { volume_info->local_nls = load_nls(volume_info->iocharset); if (volume_info->local_nls == NULL) { - cERROR(1, "CIFS mount error: iocharset %s not found", + cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n", volume_info->iocharset); return -ELIBACC; } @@ -3709,7 +3651,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, NTLMv2 password here) */ #ifdef CONFIG_CIFS_WEAK_PW_HASH if ((global_secflags & CIFSSEC_MAY_LANMAN) && - (ses->server->secType == LANMAN)) + (ses->sectype == LANMAN)) calc_lanman_hash(tcon->password, ses->server->cryptkey, ses->server->sec_mode & SECMODE_PW_ENCRYPT ? true : false, @@ -3727,8 +3669,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } } - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_STATUS32) { @@ -3780,18 +3721,18 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (length == 3) { if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && (bcc_ptr[2] == 'C')) { - cFYI(1, "IPC connection"); + cifs_dbg(FYI, "IPC connection\n"); tcon->ipc = 1; } } else if (length == 2) { if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { /* the most common case */ - cFYI(1, "disk share connection"); + cifs_dbg(FYI, "disk share connection\n"); } } bcc_ptr += length + 1; bytes_left -= (length + 1); - strncpy(tcon->treeName, tree, MAX_TREE_SIZE); + strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); @@ -3799,7 +3740,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, bytes_left, is_unicode, nls_codepage); - cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem); + cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem); if ((smb_buffer_response->WordCount == 3) || (smb_buffer_response->WordCount == 7)) @@ -3807,7 +3748,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); else tcon->Flags = 0; - cFYI(1, "Tcon flags: 0x%x ", tcon->Flags); + cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); } else if ((rc == 0) && tcon == NULL) { /* all we need to save for IPC$ connection */ ses->ipc_tid = smb_buffer_response->Tid; @@ -3880,31 +3821,32 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, int rc = -ENOSYS; struct TCP_Server_Info *server = ses->server; - ses->flags = 0; ses->capabilities = server->capabilities; if (linuxExtEnabled == 0) ses->capabilities &= (~server->vals->cap_unix); - cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", + cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); if (rc) { - cERROR(1, "Send error in SessSetup = %d", rc); + cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc); } else { - mutex_lock(&ses->server->srv_mutex); + mutex_lock(&server->srv_mutex); if (!server->session_estab) { server->session_key.response = ses->auth_key.response; server->session_key.len = ses->auth_key.len; server->sequence_number = 0x2; server->session_estab = true; ses->auth_key.response = NULL; + if (server->ops->generate_signingkey) + server->ops->generate_signingkey(server); } mutex_unlock(&server->srv_mutex); - cFYI(1, "CIFS Session Established successfully"); + cifs_dbg(FYI, "CIFS Session Established successfully\n"); spin_lock(&GlobalMid_Lock); ses->status = CifsGood; ses->need_reconnect = false; @@ -3923,23 +3865,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, static int cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) { - switch (ses->server->secType) { - case Kerberos: - vol->secFlg = CIFSSEC_MUST_KRB5; + vol->sectype = ses->sectype; + + /* krb5 is special, since we don't need username or pw */ + if (vol->sectype == Kerberos) return 0; - case NTLMv2: - vol->secFlg = CIFSSEC_MUST_NTLMV2; - break; - case NTLM: - vol->secFlg = CIFSSEC_MUST_NTLM; - break; - case RawNTLMSSP: - vol->secFlg = CIFSSEC_MUST_NTLMSSP; - break; - case LANMAN: - vol->secFlg = CIFSSEC_MUST_LANMAN; - break; - } return cifs_set_cifscreds(vol, ses); } @@ -3965,6 +3895,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) vol_info->nocase = master_tcon->nocase; vol_info->local_lease = master_tcon->local_lease; vol_info->no_linux_ext = !master_tcon->unix_ext; + vol_info->sectype = master_tcon->ses->sectype; + vol_info->sign = master_tcon->ses->sign; rc = cifs_set_vol_auth(vol_info, master_tcon->ses); if (rc) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1cd016217448..5175aebf6737 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -102,7 +102,7 @@ cifs_bp_rename_retry: namelen += (1 + temp->d_name.len); temp = temp->d_parent; if (temp == NULL) { - cERROR(1, "corrupt dentry"); + cifs_dbg(VFS, "corrupt dentry\n"); rcu_read_unlock(); return NULL; } @@ -124,12 +124,12 @@ cifs_bp_rename_retry: full_path[namelen] = dirsep; strncpy(full_path + namelen + 1, temp->d_name.name, temp->d_name.len); - cFYI(0, "name: %s", full_path + namelen); + cifs_dbg(FYI, "name: %s\n", full_path + namelen); } spin_unlock(&temp->d_lock); temp = temp->d_parent; if (temp == NULL) { - cERROR(1, "corrupt dentry"); + cifs_dbg(VFS, "corrupt dentry\n"); rcu_read_unlock(); kfree(full_path); return NULL; @@ -137,8 +137,8 @@ cifs_bp_rename_retry: } rcu_read_unlock(); if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { - cFYI(1, "did not end path lookup where expected. namelen=%d " - "dfsplen=%d", namelen, dfsplen); + cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", + namelen, dfsplen); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ @@ -178,7 +178,7 @@ check_name(struct dentry *direntry) if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { - cFYI(1, "Invalid file name"); + cifs_dbg(FYI, "Invalid file name\n"); return -EINVAL; } } @@ -291,7 +291,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, else if ((oflags & O_CREAT) == O_CREAT) disposition = FILE_OPEN_IF; else - cFYI(1, "Create flag not set in create function"); + cifs_dbg(FYI, "Create flag not set in create function\n"); /* * BB add processing to set equivalent of mode - e.g. via CreateX with @@ -323,7 +323,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, desired_access, create_options, fid, oplock, buf, cifs_sb); if (rc) { - cFYI(1, "cifs_create returned 0x%x", rc); + cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); goto out; } @@ -389,7 +389,8 @@ cifs_create_get_file_info: cifs_create_set_dentry: if (rc != 0) { - cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); + cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", + rc); if (server->ops->close) server->ops->close(xid, tcon, fid); goto out; @@ -452,12 +453,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, xid = get_xid(); - cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", - inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", + inode, direntry->d_name.name, direntry); tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); - if (IS_ERR(tlink)) + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); goto out_free_xid; + } tcon = tlink_tcon(tlink); server = tcon->ses->server; @@ -518,8 +521,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, __u32 oplock; int created = FILE_CREATED; - cFYI(1, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p", - inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", + inode, direntry->d_name.name, direntry); tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); @@ -613,7 +616,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; - cFYI(1, "sfu compat create special file"); + cifs_dbg(FYI, "sfu compat create special file\n"); buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { @@ -688,8 +691,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, xid = get_xid(); - cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", - parent_dir_inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", + parent_dir_inode, direntry->d_name.name, direntry); /* check whether path exists */ @@ -715,11 +718,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } if (direntry->d_inode != NULL) { - cFYI(1, "non-NULL inode in lookup"); + cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { - cFYI(1, "NULL inode in lookup"); + cifs_dbg(FYI, "NULL inode in lookup\n"); } - cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode); + cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", + full_path, direntry->d_inode); if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, @@ -742,7 +746,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ } else if (rc != -EACCES) { - cERROR(1, "Unexpected lookup error %d", rc); + cifs_dbg(VFS, "Unexpected lookup error %d\n", rc); /* We special case check for Access Denied - since that is a common return code */ } @@ -807,7 +811,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { int rc = 0; - cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name); + cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name); return rc; } */ @@ -818,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = { /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; -static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *q) +static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) { struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; @@ -834,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, return 0; } -static int cifs_ci_compare(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls; if ((name->len == len) && (nls_strnicmp(codepage, name->name, str, len) == 0)) diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 1d2d91d9bf65..7ede7306599f 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -34,7 +34,7 @@ /** * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. - * @unc: UNC path specifying the server + * @unc: UNC path specifying the server (with '/' as delimiter) * @ip_addr: Where to return the IP address. * * The IP address will be returned in string form, and the caller is @@ -55,7 +55,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) len = strlen(unc); if (len < 3) { - cFYI(1, "%s: unc is too short: %s", __func__, unc); + cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc); return -EINVAL; } @@ -64,12 +64,12 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) hostname = unc + 2; /* Search for server name delimiter */ - sep = memchr(hostname, '\\', len); + sep = memchr(hostname, '/', len); if (sep) len = sep - hostname; else - cFYI(1, "%s: probably server name is whole unc: %s", - __func__, unc); + cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n", + __func__, unc); /* Try to interpret hostname as an IPv4 or IPv6 address */ rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); @@ -79,11 +79,11 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Perform the upcall */ rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); if (rc < 0) - cFYI(1, "%s: unable to resolve: %*.*s", - __func__, len, len, hostname); + cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n", + __func__, len, len, hostname); else - cFYI(1, "%s: resolved: %*.*s to %s", - __func__, len, len, hostname, *ip_addr); + cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n", + __func__, len, len, hostname, *ip_addr); return rc; name_is_IP_address: @@ -92,7 +92,8 @@ name_is_IP_address: return -ENOMEM; memcpy(name, hostname, len); name[len] = 0; - cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n", + __func__, name); *ip_addr = name; return 0; } diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 9c7ecdccf2f3..ce8b7f677c58 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -49,7 +49,7 @@ static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ - cFYI(1, "get parent for %p", dentry); + cifs_dbg(FYI, "get parent for %p\n", dentry); return ERR_PTR(-EACCES); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7a0dd99e4507..91d8629e69a2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -78,9 +78,8 @@ static u32 cifs_posix_convert_flags(unsigned int flags) if (flags & O_EXCL) posix_flags |= SMB_O_EXCL; } else if (flags & O_EXCL) - cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag" - "but not O_CREAT on file open. Ignoring O_EXCL", - current->comm, current->tgid); + cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. Ignoring O_EXCL\n", + current->comm, current->tgid); if (flags & O_TRUNC) posix_flags |= SMB_O_TRUNC; @@ -123,7 +122,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, struct tcon_link *tlink; struct cifs_tcon *tcon; - cFYI(1, "posix open %s", full_path); + cifs_dbg(FYI, "posix open %s\n", full_path); presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); if (presp_data == NULL) @@ -308,7 +307,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, */ if (oplock == server->vals->oplock_read && cifs_has_mand_locks(cinode)) { - cFYI(1, "Reset oplock val from read to None due to mand locks"); + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); oplock = 0; } @@ -374,8 +373,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) list_del(&cifs_file->tlist); if (list_empty(&cifsi->openFileList)) { - cFYI(1, "closing last open instance for inode %p", - cifs_file->dentry->d_inode); + cifs_dbg(FYI, "closing last open instance for inode %p\n", + cifs_file->dentry->d_inode); /* * In strict cache mode we need invalidate mapping on the last * close because it may cause a error when we open this file @@ -454,7 +453,7 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } - cFYI(1, "inode = 0x%p file flags are 0x%x for %s", + cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", inode, file->f_flags, full_path); if (server->oplocks) @@ -470,16 +469,13 @@ int cifs_open(struct inode *inode, struct file *file) cifs_sb->mnt_file_mode /* ignored */, file->f_flags, &oplock, &fid.netfid, xid); if (rc == 0) { - cFYI(1, "posix open succeeded"); + cifs_dbg(FYI, "posix open succeeded\n"); posix_open_ok = true; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) - cERROR(1, "server %s of type %s returned" - " unexpected error on SMB posix open" - ", disabling posix open support." - " Check if server update available.", - tcon->ses->serverName, - tcon->ses->serverNOS); + cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n", + tcon->ses->serverName, + tcon->ses->serverNOS); tcon->broken_posix_open = true; } else if ((rc != -EIO) && (rc != -EREMOTE) && (rc != -EOPNOTSUPP)) /* path not found or net err */ @@ -621,8 +617,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) return rc; } - cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, cfile->f_flags, - full_path); + cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n", + inode, cfile->f_flags, full_path); if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -643,7 +639,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &fid.netfid, xid); if (rc == 0) { - cFYI(1, "posix reopen succeeded"); + cifs_dbg(FYI, "posix reopen succeeded\n"); goto reopen_success; } /* @@ -672,8 +668,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) NULL, cifs_sb); if (rc) { mutex_unlock(&cfile->fh_mutex); - cFYI(1, "cifs_reopen returned 0x%x", rc); - cFYI(1, "oplock: %d", oplock); + cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc); + cifs_dbg(FYI, "oplock: %d\n", oplock); goto reopen_error_exit; } @@ -729,7 +725,7 @@ int cifs_closedir(struct inode *inode, struct file *file) struct TCP_Server_Info *server; char *buf; - cFYI(1, "Closedir inode = 0x%p", inode); + cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode); if (cfile == NULL) return rc; @@ -738,7 +734,7 @@ int cifs_closedir(struct inode *inode, struct file *file) tcon = tlink_tcon(cfile->tlink); server = tcon->ses->server; - cFYI(1, "Freeing private data in close dir"); + cifs_dbg(FYI, "Freeing private data in close dir\n"); spin_lock(&cifs_file_list_lock); if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { cfile->invalidHandle = true; @@ -747,7 +743,7 @@ int cifs_closedir(struct inode *inode, struct file *file) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else rc = -ENOSYS; - cFYI(1, "Closing uncompleted readdir with rc %d", rc); + cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc); /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else @@ -755,7 +751,7 @@ int cifs_closedir(struct inode *inode, struct file *file) buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { - cFYI(1, "closedir free smb buf in srch struct"); + cifs_dbg(FYI, "closedir free smb buf in srch struct\n"); cfile->srch_inf.ntwrk_buf_start = NULL; if (cfile->srch_inf.smallBuf) cifs_small_buf_release(buf); @@ -1003,7 +999,7 @@ try_again: rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); if (!rc) goto try_again; - locks_delete_block(flock); + posix_unblock_lock(flock); } return rc; } @@ -1096,6 +1092,7 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { + struct inode *inode = cfile->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; unsigned int count = 0, i = 0; @@ -1106,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - lock_flocks(); - cifs_for_each_lock(cfile->dentry->d_inode, before) { + spin_lock(&inode->i_lock); + cifs_for_each_lock(inode, before) { if ((*before)->fl_flags & FL_POSIX) count++; } - unlock_flocks(); + spin_unlock(&inode->i_lock); INIT_LIST_HEAD(&locks_to_send); @@ -1130,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } el = locks_to_send.next; - lock_flocks(); - cifs_for_each_lock(cfile->dentry->d_inode, before) { + spin_lock(&inode->i_lock); + cifs_for_each_lock(inode, before) { flock = *before; if ((flock->fl_flags & FL_POSIX) == 0) continue; @@ -1140,7 +1137,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) * The list ended. We don't have enough allocated * structures - something is really wrong. */ - cERROR(1, "Can't push all brlocks!"); + cifs_dbg(VFS, "Can't push all brlocks!\n"); break; } length = 1 + flock->fl_end - flock->fl_start; @@ -1156,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->offset = flock->fl_start; el = el->next; } - unlock_flocks(); + spin_unlock(&inode->i_lock); list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { int stored_rc; @@ -1213,47 +1210,46 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, bool *wait_flag, struct TCP_Server_Info *server) { if (flock->fl_flags & FL_POSIX) - cFYI(1, "Posix"); + cifs_dbg(FYI, "Posix\n"); if (flock->fl_flags & FL_FLOCK) - cFYI(1, "Flock"); + cifs_dbg(FYI, "Flock\n"); if (flock->fl_flags & FL_SLEEP) { - cFYI(1, "Blocking lock"); + cifs_dbg(FYI, "Blocking lock\n"); *wait_flag = true; } if (flock->fl_flags & FL_ACCESS) - cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); + cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n"); if (flock->fl_flags & FL_LEASE) - cFYI(1, "Lease on file - not implemented yet"); + cifs_dbg(FYI, "Lease on file - not implemented yet\n"); if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE | FL_CLOSE))) - cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); + cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); *type = server->vals->large_lock_type; if (flock->fl_type == F_WRLCK) { - cFYI(1, "F_WRLCK "); + cifs_dbg(FYI, "F_WRLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; } else if (flock->fl_type == F_UNLCK) { - cFYI(1, "F_UNLCK"); + cifs_dbg(FYI, "F_UNLCK\n"); *type |= server->vals->unlock_lock_type; *unlock = 1; /* Check if unlock includes more than one lock range */ } else if (flock->fl_type == F_RDLCK) { - cFYI(1, "F_RDLCK"); + cifs_dbg(FYI, "F_RDLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; } else if (flock->fl_type == F_EXLCK) { - cFYI(1, "F_EXLCK"); + cifs_dbg(FYI, "F_EXLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; } else if (flock->fl_type == F_SHLCK) { - cFYI(1, "F_SHLCK"); + cifs_dbg(FYI, "F_SHLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; } else - cFYI(1, "Unknown type of lock"); + cifs_dbg(FYI, "Unknown type of lock\n"); } static int @@ -1296,8 +1292,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, type, 0, 1, false); flock->fl_type = F_UNLCK; if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); + cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", + rc); return 0; } @@ -1316,8 +1312,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, type | server->vals->shared_lock_type, 0, 1, false); flock->fl_type = F_RDLCK; if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); + cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", + rc); } else flock->fl_type = F_WRLCK; @@ -1508,8 +1504,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (!CIFS_I(inode)->clientCanCacheAll && CIFS_I(inode)->clientCanCacheRead) { cifs_invalidate_mapping(inode); - cFYI(1, "Set no oplock for inode=%p due to mand locks", - inode); + cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", + inode); CIFS_I(inode)->clientCanCacheRead = false; } @@ -1546,9 +1542,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) rc = -EACCES; xid = get_xid(); - cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " - "end: %lld", cmd, flock->fl_flags, flock->fl_type, - flock->fl_start, flock->fl_end); + cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n", + cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); cfile = (struct cifsFileInfo *)file->private_data; tcon = tlink_tcon(cfile->tlink); @@ -1620,8 +1616,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, cifs_sb = CIFS_SB(dentry->d_sb); - cFYI(1, "write %zd bytes to offset %lld of %s", write_size, - *offset, dentry->d_name.name); + cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n", + write_size, *offset, dentry->d_name.name); tcon = tlink_tcon(open_file->tlink); server = tcon->ses->server; @@ -1736,7 +1732,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, it being zero) during stress testcases so we need to check for it */ if (cifs_inode == NULL) { - cERROR(1, "Null inode passed to cifs_writeable_file"); + cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n"); dump_stack(); return NULL; } @@ -1848,7 +1844,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) else if (bytes_written < 0) rc = bytes_written; } else { - cFYI(1, "No writeable filehandles for inode"); + cifs_dbg(FYI, "No writeable filehandles for inode\n"); rc = -EIO; } @@ -2015,7 +2011,7 @@ retry: wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); if (!wdata->cfile) { - cERROR(1, "No writable handles for inode"); + cifs_dbg(VFS, "No writable handles for inode\n"); rc = -EBADF; break; } @@ -2076,7 +2072,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) /* BB add check for wbc flags */ page_cache_get(page); if (!PageUptodate(page)) - cFYI(1, "ppw - page not up to date"); + cifs_dbg(FYI, "ppw - page not up to date\n"); /* * Set the "writeback" flag, and clear "dirty" in the radix tree. @@ -2127,7 +2123,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, else pid = current->tgid; - cFYI(1, "write_end for page %p from pos %lld with %d bytes", + cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n", page, pos, copied); if (PageChecked(page)) { @@ -2191,13 +2187,13 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, xid = get_xid(); - cFYI(1, "Sync file - name: %s datasync: 0x%x", - file->f_path.dentry->d_name.name, datasync); + cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", + file->f_path.dentry->d_name.name, datasync); if (!CIFS_I(inode)->clientCanCacheRead) { rc = cifs_invalidate_mapping(inode); if (rc) { - cFYI(1, "rc: %d during invalidate phase", rc); + cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); rc = 0; /* don't care about it in fsync */ } } @@ -2233,8 +2229,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) xid = get_xid(); - cFYI(1, "Sync file - name: %s datasync: 0x%x", - file->f_path.dentry->d_name.name, datasync); + cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", + file->f_path.dentry->d_name.name, datasync); tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { @@ -2262,7 +2258,7 @@ int cifs_flush(struct file *file, fl_owner_t id) if (file->f_mode & FMODE_WRITE) rc = filemap_write_and_wait(inode->i_mapping); - cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); + cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc); return rc; } @@ -2520,8 +2516,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); - /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. @@ -2545,7 +2539,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, } up_read(&cinode->lock_sem); - sb_end_write(inode->i_sb); return rc; } @@ -2582,8 +2575,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, * an old data. */ cifs_invalidate_mapping(inode); - cFYI(1, "Set no oplock for inode=%p after a write operation", - inode); + cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + inode); cinode->clientCanCacheRead = false; } return written; @@ -2759,15 +2752,15 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, /* enough data to fill the page */ iov.iov_base = kmap(page); iov.iov_len = PAGE_SIZE; - cFYI(1, "%u: iov_base=%p iov_len=%zu", - i, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", + i, iov.iov_base, iov.iov_len); len -= PAGE_SIZE; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ iov.iov_base = kmap(page); iov.iov_len = len; - cFYI(1, "%u: iov_base=%p iov_len=%zu", - i, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", + i, iov.iov_base, iov.iov_len); memset(iov.iov_base + len, '\0', PAGE_SIZE - len); rdata->tailsz = len; len = 0; @@ -2827,7 +2820,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, pid = current->tgid; if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cFYI(1, "attempting read on write only file instance"); + cifs_dbg(FYI, "attempting read on write only file instance\n"); do { cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); @@ -3006,7 +2999,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) pid = current->tgid; if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cFYI(1, "attempting read on write only file instance"); + cifs_dbg(FYI, "attempting read on write only file instance\n"); for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { @@ -3097,7 +3090,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) xid = get_xid(); rc = cifs_revalidate_file(file); if (rc) { - cFYI(1, "Validation prior to mmap failed, error=%d", rc); + cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n", + rc); free_xid(xid); return rc; } @@ -3150,7 +3144,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, /* determine the eof that the server (probably) has */ eof = CIFS_I(rdata->mapping->host)->server_eof; eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; - cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { @@ -3160,15 +3154,15 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, /* enough data to fill the page */ iov.iov_base = kmap(page); iov.iov_len = PAGE_CACHE_SIZE; - cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - i, page->index, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", + i, page->index, iov.iov_base, iov.iov_len); len -= PAGE_CACHE_SIZE; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ iov.iov_base = kmap(page); iov.iov_len = len; - cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - i, page->index, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", + i, page->index, iov.iov_base, iov.iov_len); memset(iov.iov_base + len, '\0', PAGE_CACHE_SIZE - len); rdata->tailsz = len; @@ -3248,8 +3242,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = 0; INIT_LIST_HEAD(&tmplist); - cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, - mapping, num_pages); + cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", + __func__, file, mapping, num_pages); /* * Start with the page at end of list and move it to private @@ -3379,7 +3373,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, if (rc < 0) goto io_error; else - cFYI(1, "Bytes read %d", rc); + cifs_dbg(FYI, "Bytes read %d\n", rc); file_inode(file)->i_atime = current_fs_time(file_inode(file)->i_sb); @@ -3417,7 +3411,7 @@ static int cifs_readpage(struct file *file, struct page *page) return rc; } - cFYI(1, "readpage %p at offset %d 0x%x", + cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n", page, (int)offset, (int)offset); rc = cifs_readpage_worker(file, page, &offset); @@ -3484,7 +3478,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, struct page *page; int rc = 0; - cFYI(1, "write_begin from %lld len %d", (long long)pos, len); + cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { @@ -3553,11 +3547,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp) return cifs_fscache_release_page(page, gfp); } -static void cifs_invalidate_page(struct page *page, unsigned long offset) +static void cifs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); } @@ -3573,7 +3568,7 @@ static int cifs_launder_page(struct page *page) .range_end = range_end, }; - cFYI(1, "Launder page: %p", page); + cifs_dbg(FYI, "Launder page: %p\n", page); if (clear_page_dirty_for_io(page)) rc = cifs_writepage_locked(page, &wbc); @@ -3593,8 +3588,8 @@ void cifs_oplock_break(struct work_struct *work) if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && cifs_has_mand_locks(cinode)) { - cFYI(1, "Reset oplock to None for inode=%p due to mand locks", - inode); + cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", + inode); cinode->clientCanCacheRead = false; } @@ -3609,12 +3604,12 @@ void cifs_oplock_break(struct work_struct *work) mapping_set_error(inode->i_mapping, rc); cifs_invalidate_mapping(inode); } - cFYI(1, "Oplock flush inode %p rc %d", inode, rc); + cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); } rc = cifs_push_locks(cfile); if (rc) - cERROR(1, "Push locks rc = %d", rc); + cifs_dbg(VFS, "Push locks rc = %d\n", rc); /* * releasing stale oplock after recent reconnect of smb session using @@ -3625,7 +3620,7 @@ void cifs_oplock_break(struct work_struct *work) if (!cfile->oplock_break_cancelled) { rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid, cinode); - cFYI(1, "Oplock release rc = %d", rc); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 42e5363b4102..2f4bc5a58054 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, &cifs_fscache_server_index_def, server); - cFYI(1, "%s: (0x%p/0x%p)", __func__, server, - server->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server, server->fscache); } void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { - cFYI(1, "%s: (0x%p/0x%p)", __func__, server, - server->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server, server->fscache); fscache_relinquish_cookie(server->fscache, 0); server->fscache = NULL; } @@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, &cifs_fscache_super_index_def, tcon); - cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, - tcon->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server->fscache, tcon->fscache); } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); fscache_relinquish_cookie(tcon->fscache, 0); tcon->fscache = NULL; } @@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, - tcon->fscache, cifsi->fscache); + cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", + __func__, tcon->fscache, cifsi->fscache); } } @@ -80,7 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 0); cifsi->fscache = NULL; } @@ -91,7 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_uncache_all_inode_pages(cifsi->fscache, inode); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; @@ -120,8 +120,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", - __func__, cifsi->fscache, old); + cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", + __func__, cifsi->fscache, old); } } @@ -131,8 +131,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) struct inode *inode = page->mapping->host; struct cifsInodeInfo *cifsi = CIFS_I(inode); - cFYI(1, "%s: (0x%p/0x%p)", __func__, page, - cifsi->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, page, cifsi->fscache); if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) return 0; } @@ -143,7 +143,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, int error) { - cFYI(1, "%s: (0x%p/%d)", __func__, page, error); + cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error); if (!error) SetPageUptodate(page); unlock_page(page); @@ -156,8 +156,8 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, - CIFS_I(inode)->fscache, page, inode); + cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", + __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, cifs_readpage_from_fscache_complete, NULL, @@ -165,15 +165,15 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case 0: /* page found in fscache, read submitted */ - cFYI(1, "%s: submitted", __func__); + cifs_dbg(FYI, "%s: submitted\n", __func__); return ret; case -ENOBUFS: /* page won't be cached */ case -ENODATA: /* page not in cache */ - cFYI(1, "%s: %d", __func__, ret); + cifs_dbg(FYI, "%s: %d\n", __func__, ret); return 1; default: - cERROR(1, "unknown error ret = %d", ret); + cifs_dbg(VFS, "unknown error ret = %d\n", ret); } return ret; } @@ -188,8 +188,8 @@ int __cifs_readpages_from_fscache(struct inode *inode, { int ret; - cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, - CIFS_I(inode)->fscache, *nr_pages, inode); + cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", + __func__, CIFS_I(inode)->fscache, *nr_pages, inode); ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, pages, nr_pages, cifs_readpage_from_fscache_complete, @@ -197,16 +197,16 @@ int __cifs_readpages_from_fscache(struct inode *inode, mapping_gfp_mask(mapping)); switch (ret) { case 0: /* read submitted to the cache for all pages */ - cFYI(1, "%s: submitted", __func__); + cifs_dbg(FYI, "%s: submitted\n", __func__); return ret; case -ENOBUFS: /* some pages are not cached and can't be */ case -ENODATA: /* some pages are not cached */ - cFYI(1, "%s: no page", __func__); + cifs_dbg(FYI, "%s: no page\n", __func__); return 1; default: - cFYI(1, "unknown error ret = %d", ret); + cifs_dbg(FYI, "unknown error ret = %d\n", ret); } return ret; @@ -216,8 +216,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, - CIFS_I(inode)->fscache, page, inode); + cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", + __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); if (ret != 0) fscache_uncache_page(CIFS_I(inode)->fscache, page); @@ -228,7 +228,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct fscache_cookie *cookie = cifsi->fscache; - cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 20887bf63121..20efd81266c6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -91,30 +91,32 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); - cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid); + cifs_dbg(FYI, "%s: revalidating inode %llu\n", + __func__, cifs_i->uniqueid); if (inode->i_state & I_NEW) { - cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid); + cifs_dbg(FYI, "%s: inode %llu is new\n", + __func__, cifs_i->uniqueid); return; } /* don't bother with revalidation if we have an oplock */ if (cifs_i->clientCanCacheRead) { - cFYI(1, "%s: inode %llu is oplocked", __func__, - cifs_i->uniqueid); + cifs_dbg(FYI, "%s: inode %llu is oplocked\n", + __func__, cifs_i->uniqueid); return; } /* revalidate if mtime or size have changed */ if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { - cFYI(1, "%s: inode %llu is unchanged", __func__, - cifs_i->uniqueid); + cifs_dbg(FYI, "%s: inode %llu is unchanged\n", + __func__, cifs_i->uniqueid); return; } - cFYI(1, "%s: invalidating inode %llu mapping", __func__, - cifs_i->uniqueid); + cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", + __func__, cifs_i->uniqueid); cifs_i->invalid_mapping = true; } @@ -169,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) inode->i_flags |= S_AUTOMOUNT; - cifs_set_ops(inode); + if (inode->i_state & I_NEW) + cifs_set_ops(inode); } void @@ -240,7 +243,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, /* safest to call it a file if we do not know */ fattr->cf_mode |= S_IFREG; fattr->cf_dtype = DT_REG; - cFYI(1, "unknown type %d", le32_to_cpu(info->Type)); + cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type)); break; } @@ -279,7 +282,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cFYI(1, "creating fake fattr for DFS referral"); + cifs_dbg(FYI, "creating fake fattr for DFS referral\n"); memset(fattr, 0, sizeof(*fattr)); fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; @@ -329,7 +332,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cFYI(1, "Getting info on %s", full_path); + cifs_dbg(FYI, "Getting info on %s\n", full_path); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -355,7 +358,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); if (tmprc) - cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } if (*pinode == NULL) { @@ -422,7 +425,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, &buf_type); if ((rc == 0) && (bytes_read >= 8)) { if (memcmp("IntxBLK", pbuf, 8) == 0) { - cFYI(1, "Block device"); + cifs_dbg(FYI, "Block device\n"); fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; if (bytes_read == 24) { @@ -434,7 +437,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxCHR", pbuf, 8) == 0) { - cFYI(1, "Char device"); + cifs_dbg(FYI, "Char device\n"); fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; if (bytes_read == 24) { @@ -446,7 +449,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxLNK", pbuf, 7) == 0) { - cFYI(1, "Symlink"); + cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; } else { @@ -497,10 +500,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, else if (rc > 3) { mode = le32_to_cpu(*((__le32 *)ea_value)); fattr->cf_mode &= ~SFBITS_MASK; - cFYI(1, "special bits 0%o org mode 0%o", mode, - fattr->cf_mode); + cifs_dbg(FYI, "special bits 0%o org mode 0%o\n", + mode, fattr->cf_mode); fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; - cFYI(1, "special mode bits 0%o", mode); + cifs_dbg(FYI, "special mode bits 0%o\n", mode); } return 0; @@ -635,11 +638,11 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tcon = tlink_tcon(tlink); server = tcon->ses->server; - cFYI(1, "Getting info on %s", full_path); + cifs_dbg(FYI, "Getting info on %s\n", full_path); if ((data == NULL) && (*inode != NULL)) { if (CIFS_I(*inode)->clientCanCacheRead) { - cFYI(1, "No need to revalidate cached inode sizes"); + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto cgii_exit; } } @@ -714,7 +717,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tcon, cifs_sb, full_path, &fattr.cf_uniqueid, data); if (tmprc) { - cFYI(1, "GetSrvInodeNum rc %d", tmprc); + cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", + tmprc); fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); } @@ -729,7 +733,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); if (tmprc) - cFYI(1, "cifs_sfu_type failed: %d", tmprc); + cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); } #ifdef CONFIG_CIFS_ACL @@ -737,8 +741,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid); if (rc) { - cFYI(1, "%s: Getting ACL failed with error: %d", - __func__, rc); + cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", + __func__, rc); goto cgii_exit; } } @@ -752,7 +756,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); if (tmprc) - cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } if (!*inode) { @@ -836,7 +840,7 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) struct inode *inode; retry_iget5_locked: - cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid); + cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid); /* hash down to 32-bits on 32-bit arch */ hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); @@ -899,7 +903,7 @@ struct inode *cifs_root_iget(struct super_block *sb) #endif if (rc && tcon->ipc) { - cFYI(1, "ipc connection - fake read inode"); + cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); inode->i_mode |= S_IFDIR; set_nlink(inode, 2); @@ -958,7 +962,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, * server times. */ if (set_time && (attrs->ia_valid & ATTR_CTIME)) { - cFYI(1, "CIFS - CTIME changed"); + cifs_dbg(FYI, "CIFS - CTIME changed\n"); info_buf.ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); } else @@ -1127,7 +1131,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) struct iattr *attrs = NULL; __u32 dosattr = 0, origattr = 0; - cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry); + cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1150,7 +1154,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) rc = CIFSPOSIXDelFile(xid, tcon, full_path, SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "posix del rc %d", rc); + cifs_dbg(FYI, "posix del rc %d\n", rc); if ((rc == 0) || (rc == -ENOENT)) goto psx_del_no_retry; } @@ -1320,7 +1324,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, if (rc == -EOPNOTSUPP) goto posix_mkdir_out; else if (rc) { - cFYI(1, "posix mkdir returned 0x%x", rc); + cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc); d_drop(dentry); goto posix_mkdir_out; } @@ -1342,11 +1346,12 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, d_instantiate(dentry, newinode); #ifdef CONFIG_CIFS_DEBUG2 - cFYI(1, "instantiated dentry %p %s to inode %p", dentry, - dentry->d_name.name, newinode); + cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n", + dentry, dentry->d_name.name, newinode); if (newinode->i_nlink != 2) - cFYI(1, "unexpected number of links %d", newinode->i_nlink); + cifs_dbg(FYI, "unexpected number of links %d\n", + newinode->i_nlink); #endif posix_mkdir_out: @@ -1368,7 +1373,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) struct TCP_Server_Info *server; char *full_path; - cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode); + cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n", + mode, inode); cifs_sb = CIFS_SB(inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); @@ -1402,7 +1408,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) /* BB add setting the equivalent of mode via CreateX w/ACLs */ rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); if (rc) { - cFYI(1, "cifs_mkdir returned 0x%x", rc); + cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } @@ -1432,7 +1438,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) char *full_path = NULL; struct cifsInodeInfo *cifsInode; - cFYI(1, "cifs_rmdir, inode = 0x%p", inode); + cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode); xid = get_xid(); @@ -1681,8 +1687,8 @@ cifs_invalidate_mapping(struct inode *inode) if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = invalidate_inode_pages2(inode->i_mapping); if (rc) { - cERROR(1, "%s: could not invalidate inode %p", __func__, - inode); + cifs_dbg(VFS, "%s: could not invalidate inode %p\n", + __func__, inode); cifs_i->invalid_mapping = true; } } @@ -1732,8 +1738,8 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) goto out; } - cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time " - "%ld jiffies %ld", full_path, inode, inode->i_count.counter, + cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", + full_path, inode, inode->i_count.counter, dentry, dentry->d_time, jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) @@ -1883,7 +1889,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, else rc = -ENOSYS; cifsFileInfo_put(open_file); - cFYI(1, "SetFSize for attrs rc = %d", rc); + cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; @@ -1894,7 +1900,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, io_parms.length = attrs->ia_size; rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, NULL, 1); - cFYI(1, "Wrt seteof rc %d", rc); + cifs_dbg(FYI, "Wrt seteof rc %d\n", rc); } } else rc = -EINVAL; @@ -1920,7 +1926,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, attrs->ia_size, cifs_sb, false); else rc = -ENOSYS; - cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); + cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { __u16 netfid; int oplock = 0; @@ -1940,7 +1946,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, io_parms.length = attrs->ia_size; rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, NULL, 1); - cFYI(1, "wrt seteof rc %d", rc); + cifs_dbg(FYI, "wrt seteof rc %d\n", rc); CIFSSMBClose(xid, tcon, netfid); } } @@ -1971,7 +1977,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) struct cifs_unix_set_info_args *args = NULL; struct cifsFileInfo *open_file; - cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x", + cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n", direntry->d_name.name, attrs->ia_valid); xid = get_xid(); @@ -2114,7 +2120,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) xid = get_xid(); - cFYI(1, "setattr on file %s attrs->iavalid 0x%x", + cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n", direntry->d_name.name, attrs->ia_valid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) @@ -2166,8 +2172,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, uid, gid); if (rc) { - cFYI(1, "%s: Setting id failed with error: %d", - __func__, rc); + cifs_dbg(FYI, "%s: Setting id failed with error: %d\n", + __func__, rc); goto cifs_setattr_exit; } } @@ -2188,8 +2194,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = id_mode_to_cifs_acl(inode, full_path, mode, INVALID_UID, INVALID_GID); if (rc) { - cFYI(1, "%s: Setting ACL failed with error: %d", - __func__, rc); + cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n", + __func__, rc); goto cifs_setattr_exit; } } else @@ -2277,7 +2283,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs) #if 0 void cifs_delete_inode(struct inode *inode) { - cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode); + cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode); /* may have to add back in if and when safe distributed caching of directories added e.g. via FindNotify */ } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 6c9f1214cf0b..3e0845585853 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -44,7 +44,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg); + cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg); cifs_sb = CIFS_SB(inode->i_sb); @@ -83,11 +83,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) * &ExtAttrMask); */ } - cFYI(1, "set flags not implemented yet"); + cifs_dbg(FYI, "set flags not implemented yet\n"); break; #endif /* CONFIG_CIFS_POSIX */ default: - cFYI(1, "unsupported ioctl"); + cifs_dbg(FYI, "unsupported ioctl\n"); break; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 9f6c4c45d21e..b83c3f5646bd 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -56,14 +56,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(md5)) { rc = PTR_ERR(md5); - cERROR(1, "%s: Crypto md5 allocation error %d", __func__, rc); + cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n", + __func__, rc); return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); sdescmd5 = kmalloc(size, GFP_KERNEL); if (!sdescmd5) { rc = -ENOMEM; - cERROR(1, "%s: Memory allocation failure", __func__); goto symlink_hash_err; } sdescmd5->shash.tfm = md5; @@ -71,17 +71,17 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) rc = crypto_shash_init(&sdescmd5->shash); if (rc) { - cERROR(1, "%s: Could not init md5 shash", __func__); + cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); if (rc) { - cERROR(1, "%s: Could not update with link_str", __func__); + cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto symlink_hash_err; } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: crypto_free_shash(md5); @@ -115,7 +115,7 @@ CIFSParseMFSymlink(const u8 *buf, rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { - cFYI(1, "%s: MD5 hash failure: %d", __func__, rc); + cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); return rc; } @@ -154,7 +154,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { - cFYI(1, "%s: MD5 hash failure: %d", __func__, rc); + cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); return rc; } @@ -521,7 +521,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) if (!full_path) goto out; - cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); + cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); rc = -EACCES; /* @@ -578,8 +578,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) goto symlink_exit; } - cFYI(1, "Full path: %s", full_path); - cFYI(1, "symname is %s", symname); + cifs_dbg(FYI, "Full path: %s\n", full_path); + cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) @@ -601,8 +601,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) inode->i_sb, xid, NULL); if (rc != 0) { - cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", - rc); + cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n", + rc); } else { d_instantiate(direntry, newinode); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1b15bf839f37..f7d4b2285efe 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -54,7 +54,7 @@ _get_xid(void) if (GlobalTotalActiveXid > GlobalMaxActiveXid) GlobalMaxActiveXid = GlobalTotalActiveXid; if (GlobalTotalActiveXid > 65000) - cFYI(1, "warning: more than 65000 requests active"); + cifs_dbg(FYI, "warning: more than 65000 requests active\n"); xid = GlobalCurrentXid++; spin_unlock(&GlobalMid_Lock); return xid; @@ -91,7 +91,7 @@ void sesInfoFree(struct cifs_ses *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, "Null buffer passed to sesInfoFree"); + cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); return; } @@ -130,7 +130,7 @@ void tconInfoFree(struct cifs_tcon *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, "Null buffer passed to tconInfoFree"); + cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); return; } atomic_dec(&tconInfoAllocCount); @@ -180,7 +180,7 @@ void cifs_buf_release(void *buf_to_free) { if (buf_to_free == NULL) { - /* cFYI(1, "Null buffer passed to cifs_buf_release");*/ + /* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/ return; } mempool_free(buf_to_free, cifs_req_poolp); @@ -216,7 +216,7 @@ cifs_small_buf_release(void *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, "Null buffer passed to cifs_small_buf_release"); + cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n"); return; } mempool_free(buf_to_free, cifs_sm_req_poolp); @@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , if (treeCon->nocase) buffer->Flags |= SMBFLG_CASELESS; if ((treeCon->ses) && (treeCon->ses->server)) - if (treeCon->ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (treeCon->ses->server->sign) buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; } @@ -282,15 +281,15 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) { /* does it have the right SMB "signature" ? */ if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { - cERROR(1, "Bad protocol string signature header 0x%x", - *(unsigned int *)smb->Protocol); + cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n", + *(unsigned int *)smb->Protocol); return 1; } /* Make sure that message ids match */ if (mid != smb->Mid) { - cERROR(1, "Mids do not match. received=%u expected=%u", - smb->Mid, mid); + cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n", + smb->Mid, mid); return 1; } @@ -302,7 +301,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; - cERROR(1, "Server sent request, not response. mid=%u", smb->Mid); + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid); return 1; } @@ -313,8 +312,8 @@ checkSMB(char *buf, unsigned int total_read) __u16 mid = smb->Mid; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", - total_read, rfclen); + cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n", + total_read, rfclen); /* is this frame too small to even get to a BCC? */ if (total_read < 2 + sizeof(struct smb_hdr)) { @@ -340,9 +339,9 @@ checkSMB(char *buf, unsigned int total_read) tmp[sizeof(struct smb_hdr)+1] = 0; return 0; } - cERROR(1, "rcvd invalid byte count (bcc)"); + cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n"); } else { - cERROR(1, "Length less than smb header size"); + cifs_dbg(VFS, "Length less than smb header size\n"); } return -EIO; } @@ -353,8 +352,8 @@ checkSMB(char *buf, unsigned int total_read) clc_len = smbCalcSize(smb); if (4 + rfclen != total_read) { - cERROR(1, "Length read does not match RFC1001 length %d", - rfclen); + cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n", + rfclen); return -EIO; } @@ -365,12 +364,12 @@ checkSMB(char *buf, unsigned int total_read) if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } - cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + rfclen, smb->Mid); + cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n", + clc_len, 4 + rfclen, smb->Mid); if (4 + rfclen < clc_len) { - cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - rfclen, smb->Mid); + cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n", + rfclen, smb->Mid); return -EIO; } else if (rfclen > clc_len + 512) { /* @@ -382,8 +381,8 @@ checkSMB(char *buf, unsigned int total_read) * trailing data, we choose limit the amount of extra * data to 512 bytes. */ - cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", rfclen, smb->Mid); + cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n", + rfclen, smb->Mid); return -EIO; } } @@ -401,7 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; - cFYI(1, "Checking for oplock break or dnotify response"); + cifs_dbg(FYI, "Checking for oplock break or dnotify response\n"); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { struct smb_com_transaction_change_notify_rsp *pSMBr = @@ -413,15 +412,15 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) pnotify = (struct file_notify_information *) ((char *)&pSMBr->hdr.Protocol + data_offset); - cFYI(1, "dnotify on %s Action: 0x%x", + cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n", pnotify->FileName, pnotify->Action); /* cifs_dump_mem("Rcvd notify Data: ",buf, sizeof(struct smb_hdr)+60); */ return true; } if (pSMBr->hdr.Status.CifsError) { - cFYI(1, "notify err 0x%d", - pSMBr->hdr.Status.CifsError); + cifs_dbg(FYI, "notify err 0x%d\n", + pSMBr->hdr.Status.CifsError); return true; } return false; @@ -435,7 +434,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) large dirty files cached on the client */ if ((NT_STATUS_INVALID_HANDLE) == le32_to_cpu(pSMB->hdr.Status.CifsError)) { - cFYI(1, "invalid handle on oplock break"); + cifs_dbg(FYI, "invalid handle on oplock break\n"); return true; } else if (ERRbadfid == le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { @@ -447,7 +446,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (pSMB->hdr.WordCount != 8) return false; - cFYI(1, "oplock type 0x%d level 0x%d", + cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n", pSMB->LockType, pSMB->OplockLevel); if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; @@ -469,7 +468,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (pSMB->Fid != netfile->fid.netfid) continue; - cFYI(1, "file id match, oplock break"); + cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); cifs_set_oplock_level(pCifsInode, @@ -484,12 +483,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) } spin_unlock(&cifs_file_list_lock); spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "No matching file for oplock break"); + cifs_dbg(FYI, "No matching file for oplock break\n"); return true; } } spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "Can not process oplock break for non-existent connection"); + cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); return true; } @@ -536,12 +535,8 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; - cERROR(1, "Autodisabling the use of server inode numbers on " - "%s. This server doesn't seem to support them " - "properly. Hardlinks will not be recognized on this " - "mount. Consider mounting with the \"noserverino\" " - "option to silence this message.", - cifs_sb_master_tcon(cifs_sb)->treeName); + cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n", + cifs_sb_master_tcon(cifs_sb)->treeName); } } @@ -552,13 +547,13 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) if (oplock == OPLOCK_EXCLUSIVE) { cinode->clientCanCacheAll = true; cinode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); } else if (oplock == OPLOCK_READ) { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = true; - cFYI(1, "Level II Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); } else { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = false; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index c0b25b28be6c..af847e1cf1c1 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -150,8 +150,8 @@ cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) else if (address_family == AF_INET6) ret = in6_pton(cp, len, dst , '\\', NULL); - cFYI(DBG2, "address conversion returned %d for %*.*s", - ret, len, len, cp); + cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n", + ret, len, len, cp); if (ret > 0) ret = 1; return ret; @@ -887,7 +887,7 @@ map_smb_to_linux_error(char *buf, bool logErr) } /* else ERRHRD class errors or junk - return EIO */ - cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. reconnect SMB session on @@ -951,20 +951,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) SMB_TIME *st = (SMB_TIME *)&time; SMB_DATE *sd = (SMB_DATE *)&date; - cFYI(1, "date %d time %d", date, time); + cifs_dbg(FYI, "date %d time %d\n", date, time); sec = 2 * st->TwoSeconds; min = st->Minutes; if ((sec > 59) || (min > 59)) - cERROR(1, "illegal time min %d sec %d", min, sec); + cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec); sec += (min * 60); sec += 60 * 60 * st->Hours; if (st->Hours > 24) - cERROR(1, "illegal hours %d", st->Hours); + cifs_dbg(VFS, "illegal hours %d\n", st->Hours); days = sd->Day; month = sd->Month; if ((days > 31) || (month > 12)) { - cERROR(1, "illegal date, month %d day: %d", month, days); + cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days); if (month > 12) month = 12; } @@ -990,7 +990,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) ts.tv_sec = sec + offset; - /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */ + /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */ ts.tv_nsec = 0; return ts; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index df40cc5fd13a..ab8778469394 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -48,15 +48,15 @@ static void dump_cifs_file_struct(struct file *file, char *label) if (file) { cf = file->private_data; if (cf == NULL) { - cFYI(1, "empty cifs private file data"); + cifs_dbg(FYI, "empty cifs private file data\n"); return; } if (cf->invalidHandle) - cFYI(1, "invalid handle"); + cifs_dbg(FYI, "invalid handle\n"); if (cf->srch_inf.endOfSearch) - cFYI(1, "end of search"); + cifs_dbg(FYI, "end of search\n"); if (cf->srch_inf.emptyDir) - cFYI(1, "empty dir"); + cifs_dbg(FYI, "empty dir\n"); } } #else @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct super_block *sb = parent->d_inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cFYI(1, "%s: for %s", __func__, name->name); + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); dentry = d_hash_and_lookup(parent, name); if (unlikely(IS_ERR(dentry))) @@ -126,6 +126,22 @@ out: dput(dentry); } +/* + * Is it possible that this directory might turn out to be a DFS referral + * once we go to try and use it? + */ +static bool +cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb) +{ +#ifdef CONFIG_CIFS_DFS_UPCALL + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + return true; +#endif + return false; +} + static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -135,6 +151,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; + /* + * Windows CIFS servers generally make DFS referrals look + * like directories in FIND_* responses with the reparse + * attribute flag also set (since DFS junctions are + * reparse points). We must revalidate at least these + * directory inodes before trying to use them (if + * they are DFS we will get PATH_NOT_COVERED back + * when queried directly and can then try to connect + * to the DFS target) + */ + if (cifs_dfs_is_possible(cifs_sb) && + (fattr->cf_cifsattrs & ATTR_REPARSE)) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -233,7 +262,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, fid, cifs_sb->local_nls); if (CIFSSMBClose(xid, ptcon, fid)) { - cFYI(1, "Error closing temporary reparsepoint open"); + cifs_dbg(FYI, "Error closing temporary reparsepoint open\n"); } } } @@ -285,7 +314,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file) goto error_exit; } - cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); + cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); ffirst_retry: /* test for Unix extensions */ @@ -336,7 +365,7 @@ static int cifs_unicode_bytelen(const char *str) if (ustr[len] == 0) return len << 1; } - cFYI(1, "Unicode string longer than PATH_MAX found"); + cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n"); return len << 1; } @@ -353,18 +382,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) pfData->FileNameLength; } else new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); - cFYI(1, "new entry %p old entry %p", new_entry, old_entry); + cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry); /* validate that new_entry is not past end of SMB */ if (new_entry >= end_of_smb) { - cERROR(1, "search entry %p began after end of SMB %p old entry %p", - new_entry, end_of_smb, old_entry); + cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n", + new_entry, end_of_smb, old_entry); return NULL; } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) || ((level != SMB_FIND_FILE_INFO_STANDARD) && (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { - cERROR(1, "search entry %p extends after end of SMB %p", - new_entry, end_of_smb); + cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n", + new_entry, end_of_smb); return NULL; } else return new_entry; @@ -457,7 +486,7 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, cifs_fill_dirent_std(de, info); break; default: - cFYI(1, "Unknown findfirst level %d", level); + cifs_dbg(FYI, "Unknown findfirst level %d\n", level); return -EINVAL; } @@ -537,14 +566,14 @@ static int cifs_save_resume_key(const char *current_entry, * every entry (do not increment for . or .. entry). */ static int -find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, +find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, struct file *file, char **current_entry, int *num_to_ret) { __u16 search_flags; int rc = 0; int pos_in_buf = 0; loff_t first_entry_in_buffer; - loff_t index_to_find = file->f_pos; + loff_t index_to_find = pos; struct cifsFileInfo *cfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); struct TCP_Server_Info *server = tcon->ses->server; @@ -572,7 +601,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, if (((index_to_find < cfile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ - cFYI(1, "search backing up - close and restart search"); + cifs_dbg(FYI, "search backing up - close and restart search\n"); spin_lock(&cifs_file_list_lock); if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { cfile->invalidHandle = true; @@ -582,7 +611,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, } else spin_unlock(&cifs_file_list_lock); if (cfile->srch_inf.ntwrk_buf_start) { - cFYI(1, "freeing SMB ff cache buf on search rewind"); + cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) cifs_small_buf_release(cfile->srch_inf. ntwrk_buf_start); @@ -593,7 +622,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, } rc = initiate_cifs_search(xid, file); if (rc) { - cFYI(1, "error %d reinitiating a search on rewind", + cifs_dbg(FYI, "error %d reinitiating a search on rewind\n", rc); return rc; } @@ -608,7 +637,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, while ((index_to_find >= cfile->srch_inf.index_of_last_entry) && (rc == 0) && !cfile->srch_inf.endOfSearch) { - cFYI(1, "calling findnext2"); + cifs_dbg(FYI, "calling findnext2\n"); rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, search_flags, &cfile->srch_inf); @@ -631,7 +660,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, first_entry_in_buffer = cfile->srch_inf.index_of_last_entry - cfile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; - cFYI(1, "found entry - pos_in_buf %d", pos_in_buf); + cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf); for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) { /* go entry by entry figuring out which is first */ @@ -640,19 +669,18 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, } if ((cur_ent == NULL) && (i < pos_in_buf)) { /* BB fixme - check if we should flag this error */ - cERROR(1, "reached end of buf searching for pos in buf" - " %d index to find %lld rc %d", pos_in_buf, - index_to_find, rc); + cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n", + pos_in_buf, index_to_find, rc); } rc = 0; *current_entry = cur_ent; } else { - cFYI(1, "index not in buffer - could not findnext into it"); + cifs_dbg(FYI, "index not in buffer - could not findnext into it\n"); return 0; } if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) { - cFYI(1, "can not return entries pos_in_buf beyond last"); + cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n"); *num_to_ret = 0; } else *num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf; @@ -660,8 +688,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, - void *dirent, char *scratch_buf, unsigned int max_len) +static int cifs_filldir(char *find_entry, struct file *file, + struct dir_context *ctx, + char *scratch_buf, unsigned int max_len) { struct cifsFileInfo *file_info = file->private_data; struct super_block *sb = file->f_path.dentry->d_sb; @@ -678,8 +707,8 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, return rc; if (de.namelen > max_len) { - cERROR(1, "bad search response length %zd past smb end", - de.namelen); + cifs_dbg(VFS, "bad search response length %zd past smb end\n", + de.namelen); return -EINVAL; } @@ -741,13 +770,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, cifs_prime_dcache(file->f_dentry, &name, &fattr); ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - rc = filldir(dirent, name.name, name.len, file->f_pos, ino, - fattr.cf_dtype); - return rc; + return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); } -int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) +int cifs_readdir(struct file *file, struct dir_context *ctx) { int rc = 0; unsigned int xid; @@ -768,108 +795,91 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) */ if (file->private_data == NULL) { rc = initiate_cifs_search(xid, file); - cFYI(1, "initiate cifs search rc %d", rc); + cifs_dbg(FYI, "initiate cifs search rc %d\n", rc); if (rc) goto rddir2_exit; } - switch ((int) file->f_pos) { - case 0: - if (filldir(direntry, ".", 1, file->f_pos, - file_inode(file)->i_ino, DT_DIR) < 0) { - cERROR(1, "Filldir for current dir failed"); - rc = -ENOMEM; - break; - } - file->f_pos++; - case 1: - if (filldir(direntry, "..", 2, file->f_pos, - parent_ino(file->f_path.dentry), DT_DIR) < 0) { - cERROR(1, "Filldir for parent dir failed"); - rc = -ENOMEM; - break; - } - file->f_pos++; - default: - /* 1) If search is active, - is in current search buffer? - if it before then restart search - if after then keep searching till find it */ - - if (file->private_data == NULL) { - rc = -EINVAL; - free_xid(xid); - return rc; - } - cifsFile = file->private_data; - if (cifsFile->srch_inf.endOfSearch) { - if (cifsFile->srch_inf.emptyDir) { - cFYI(1, "End of search, empty dir"); - rc = 0; - break; - } - } /* else { - cifsFile->invalidHandle = true; - tcon->ses->server->close(xid, tcon, &cifsFile->fid); - } */ + if (!dir_emit_dots(file, ctx)) + goto rddir2_exit; - tcon = tlink_tcon(cifsFile->tlink); - rc = find_cifs_entry(xid, tcon, file, ¤t_entry, - &num_to_fill); - if (rc) { - cFYI(1, "fce error %d", rc); - goto rddir2_exit; - } else if (current_entry != NULL) { - cFYI(1, "entry %lld found", file->f_pos); - } else { - cFYI(1, "could not find entry"); + /* 1) If search is active, + is in current search buffer? + if it before then restart search + if after then keep searching till find it */ + + if (file->private_data == NULL) { + rc = -EINVAL; + goto rddir2_exit; + } + cifsFile = file->private_data; + if (cifsFile->srch_inf.endOfSearch) { + if (cifsFile->srch_inf.emptyDir) { + cifs_dbg(FYI, "End of search, empty dir\n"); + rc = 0; goto rddir2_exit; } - cFYI(1, "loop through %d times filling dir for net buf %p", - num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); - max_len = tcon->ses->server->ops->calc_smb_size( - cifsFile->srch_inf.ntwrk_buf_start); - end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; - - tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); - if (tmp_buf == NULL) { - rc = -ENOMEM; + } /* else { + cifsFile->invalidHandle = true; + tcon->ses->server->close(xid, tcon, &cifsFile->fid); + } */ + + tcon = tlink_tcon(cifsFile->tlink); + rc = find_cifs_entry(xid, tcon, ctx->pos, file, ¤t_entry, + &num_to_fill); + if (rc) { + cifs_dbg(FYI, "fce error %d\n", rc); + goto rddir2_exit; + } else if (current_entry != NULL) { + cifs_dbg(FYI, "entry %lld found\n", ctx->pos); + } else { + cifs_dbg(FYI, "could not find entry\n"); + goto rddir2_exit; + } + cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", + num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); + max_len = tcon->ses->server->ops->calc_smb_size( + cifsFile->srch_inf.ntwrk_buf_start); + end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; + + tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + goto rddir2_exit; + } + + for (i = 0; i < num_to_fill; i++) { + if (current_entry == NULL) { + /* evaluate whether this case is an error */ + cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", + num_to_fill, i); break; } - - for (i = 0; (i < num_to_fill) && (rc == 0); i++) { - if (current_entry == NULL) { - /* evaluate whether this case is an error */ - cERROR(1, "past SMB end, num to fill %d i %d", - num_to_fill, i); - break; - } - /* - * if buggy server returns . and .. late do we want to - * check for that here? - */ - rc = cifs_filldir(current_entry, file, filldir, - direntry, tmp_buf, max_len); - if (rc == -EOVERFLOW) { + /* + * if buggy server returns . and .. late do we want to + * check for that here? + */ + rc = cifs_filldir(current_entry, file, ctx, + tmp_buf, max_len); + if (rc) { + if (rc > 0) rc = 0; - break; - } - - file->f_pos++; - if (file->f_pos == - cifsFile->srch_inf.index_of_last_entry) { - cFYI(1, "last entry in buf at pos %lld %s", - file->f_pos, tmp_buf); - cifs_save_resume_key(current_entry, cifsFile); - break; - } else - current_entry = - nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + break; } - kfree(tmp_buf); - break; - } /* end switch */ + + ctx->pos++; + if (ctx->pos == + cifsFile->srch_inf.index_of_last_entry) { + cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", + ctx->pos, tmp_buf); + cifs_save_resume_key(current_entry, cifsFile); + break; + } else + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + kfree(tmp_buf); rddir2_exit: free_xid(xid); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 76809f4d3428..79358e341fd2 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (ses->server->sign) pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_UNICODE) { @@ -283,11 +282,11 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, int len; char *data = *pbcc_area; - cFYI(1, "bleft %d", bleft); + cifs_dbg(FYI, "bleft %d\n", bleft); kfree(ses->serverOS); ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); - cFYI(1, "serverOS=%s", ses->serverOS); + cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS); len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; data += len; bleft -= len; @@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, kfree(ses->serverNOS); ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); - cFYI(1, "serverNOS=%s", ses->serverNOS); + cifs_dbg(FYI, "serverNOS=%s\n", ses->serverNOS); len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; data += len; bleft -= len; @@ -305,41 +304,38 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, kfree(ses->serverDomain); ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp); - cFYI(1, "serverDomain=%s", ses->serverDomain); + cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain); return; } -static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, - struct cifs_ses *ses, - const struct nls_table *nls_cp) +static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, + struct cifs_ses *ses, + const struct nls_table *nls_cp) { - int rc = 0; int len; char *bcc_ptr = *pbcc_area; - cFYI(1, "decode sessetup ascii. bleft %d", bleft); + cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft); len = strnlen(bcc_ptr, bleft); if (len >= bleft) - return rc; + return; kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); if (ses->serverOS) strncpy(ses->serverOS, bcc_ptr, len); - if (strncmp(ses->serverOS, "OS/2", 4) == 0) { - cFYI(1, "OS/2 server"); - ses->flags |= CIFS_SES_OS2; - } + if (strncmp(ses->serverOS, "OS/2", 4) == 0) + cifs_dbg(FYI, "OS/2 server\n"); bcc_ptr += len + 1; bleft -= len + 1; len = strnlen(bcc_ptr, bleft); if (len >= bleft) - return rc; + return; kfree(ses->serverNOS); @@ -352,16 +348,14 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, len = strnlen(bcc_ptr, bleft); if (len > bleft) - return rc; + return; /* No domain field in LANMAN case. Domain is returned by old servers in the SMB negprot response */ /* BB For newer servers which do not support Unicode, but thus do return domain here we could add parsing for it later, but it is not very important */ - cFYI(1, "ascii: bytes left %d", bleft); - - return rc; + cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); } int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, @@ -373,16 +367,18 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; if (blob_len < sizeof(CHALLENGE_MESSAGE)) { - cERROR(1, "challenge blob len %d too small", blob_len); + cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len); return -EINVAL; } if (memcmp(pblob->Signature, "NTLMSSP", 8)) { - cERROR(1, "blob signature incorrect %s", pblob->Signature); + cifs_dbg(VFS, "blob signature incorrect %s\n", + pblob->Signature); return -EINVAL; } if (pblob->MessageType != NtLmChallenge) { - cERROR(1, "Incorrect message type %d", pblob->MessageType); + cifs_dbg(VFS, "Incorrect message type %d\n", + pblob->MessageType); return -EINVAL; } @@ -395,16 +391,17 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); if (tioffset > blob_len || tioffset + tilen > blob_len) { - cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen); + cifs_dbg(VFS, "tioffset + tilen too high %u + %u", + tioffset, tilen); return -EINVAL; } if (tilen) { - ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); + ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen, + GFP_KERNEL); if (!ses->auth_key.response) { - cERROR(1, "Challenge target info allocation failure"); + cifs_dbg(VFS, "Challenge target info alloc failure"); return -ENOMEM; } - memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen); ses->auth_key.len = tilen; } @@ -429,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; if (!ses->server->session_estab) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; @@ -468,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; if (!ses->server->session_estab) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; @@ -486,7 +481,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { - cERROR(1, "Error %d during NTLMSSP authentication", rc); + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); goto setup_ntlmv2_ret; } memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, @@ -555,6 +550,56 @@ setup_ntlmv2_ret: return rc; } +enum securityEnum +select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) +{ + switch (server->negflavor) { + case CIFS_NEGFLAVOR_EXTENDED: + switch (requested) { + case Kerberos: + case RawNTLMSSP: + return requested; + case Unspecified: + if (server->sec_ntlmssp && + (global_secflags & CIFSSEC_MAY_NTLMSSP)) + return RawNTLMSSP; + if ((server->sec_kerberos || server->sec_mskerberos) && + (global_secflags & CIFSSEC_MAY_KRB5)) + return Kerberos; + /* Fallthrough */ + default: + return Unspecified; + } + case CIFS_NEGFLAVOR_UNENCAP: + switch (requested) { + case NTLM: + case NTLMv2: + return requested; + case Unspecified: + if (global_secflags & CIFSSEC_MAY_NTLMV2) + return NTLMv2; + if (global_secflags & CIFSSEC_MAY_NTLM) + return NTLM; + /* Fallthrough */ + default: + return Unspecified; + } + case CIFS_NEGFLAVOR_LANMAN: + switch (requested) { + case LANMAN: + return requested; + case Unspecified: + if (global_secflags & CIFSSEC_MAY_LANMAN) + return LANMAN; + /* Fallthrough */ + default: + return Unspecified; + } + default: + return Unspecified; + } +} + int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) @@ -576,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, u16 blob_len; char *ntlmsspblob = NULL; - if (ses == NULL) + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, "Unable to select appropriate authentication method!"); return -EINVAL; + } - type = ses->server->secType; - cFYI(1, "sess setup type %d", type); if (type == RawNTLMSSP) { /* if memory allocation is successful, caller of this function * frees it. @@ -640,8 +692,6 @@ ssetup_ntlmssp_authenticate: } bcc_ptr = str_area; - ses->flags &= ~CIFS_SES_LANMAN; - iov[1].iov_base = NULL; iov[1].iov_len = 0; @@ -665,7 +715,6 @@ ssetup_ntlmssp_authenticate: ses->server->sec_mode & SECMODE_PW_ENCRYPT ? true : false, lnm_session_key); - ses->flags |= CIFS_SES_LANMAN; memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); bcc_ptr += CIFS_AUTH_RESP_SIZE; @@ -674,7 +723,7 @@ ssetup_ntlmssp_authenticate: changed to do higher than lanman dialect and we reconnected would we ever calc signing_key? */ - cFYI(1, "Negotiating LANMAN setting up strings"); + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); /* Unicode not allowed for LANMAN dialects */ ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #endif @@ -688,7 +737,8 @@ ssetup_ntlmssp_authenticate: /* calculate ntlm response and session key */ rc = setup_ntlm_response(ses, nls_cp); if (rc) { - cERROR(1, "Error %d during NTLM authentication", rc); + cifs_dbg(VFS, "Error %d during NTLM authentication\n", + rc); goto ssetup_exit; } @@ -718,7 +768,8 @@ ssetup_ntlmssp_authenticate: /* calculate nlmv2 response and session key */ rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { - cERROR(1, "Error %d during NTLMv2 authentication", rc); + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", + rc); goto ssetup_exit; } memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, @@ -754,21 +805,21 @@ ssetup_ntlmssp_authenticate: /* check version field to make sure that cifs.upcall is sending us a response in an expected form */ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cERROR(1, "incorrect version of cifs.upcall (expected" - " %d but got %d)", + cifs_dbg(VFS, "incorrect version of cifs.upcall " + "expected %d but got %d)", CIFS_SPNEGO_UPCALL_VERSION, msg->version); rc = -EKEYREJECTED; goto ssetup_exit; } - ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL); + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); if (!ses->auth_key.response) { - cERROR(1, "Kerberos can't allocate (%u bytes) memory", + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", msg->sesskey_len); rc = -ENOMEM; goto ssetup_exit; } - memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); ses->auth_key.len = msg->sesskey_len; pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; @@ -790,18 +841,18 @@ ssetup_ntlmssp_authenticate: /* BB: is this right? */ ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #else /* ! CONFIG_CIFS_UPCALL */ - cERROR(1, "Kerberos negotiated but upcall support disabled!"); + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); rc = -ENOSYS; goto ssetup_exit; #endif /* CONFIG_CIFS_UPCALL */ } else if (type == RawNTLMSSP) { if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cERROR(1, "NTLMSSP requires Unicode support"); + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); rc = -ENOSYS; goto ssetup_exit; } - cFYI(1, "ntlmssp session setup phase %d", phase); + cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities |= cpu_to_le32(capabilities); @@ -824,7 +875,6 @@ ssetup_ntlmssp_authenticate: 5*sizeof(struct _AUTHENTICATE_MESSAGE), GFP_KERNEL); if (!ntlmsspblob) { - cERROR(1, "Can't allocate NTLMSSP blob"); rc = -ENOMEM; goto ssetup_exit; } @@ -844,7 +894,7 @@ ssetup_ntlmssp_authenticate: smb_buf->Uid = ses->Suid; break; default: - cERROR(1, "invalid phase %d", phase); + cifs_dbg(VFS, "invalid phase %d\n", phase); rc = -ENOSYS; goto ssetup_exit; } @@ -855,7 +905,7 @@ ssetup_ntlmssp_authenticate: } unicode_oslm_strings(&bcc_ptr, nls_cp); } else { - cERROR(1, "secType %d not supported!", type); + cifs_dbg(VFS, "secType %d not supported!\n", type); rc = -ENOSYS; goto ssetup_exit; } @@ -880,7 +930,7 @@ ssetup_ntlmssp_authenticate: (smb_buf->Status.CifsError == cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { if (phase != NtLmNegotiate) { - cERROR(1, "Unexpected more processing error"); + cifs_dbg(VFS, "Unexpected more processing error\n"); goto ssetup_exit; } /* NTLMSSP Negotiate sent now processing challenge (response) */ @@ -892,14 +942,14 @@ ssetup_ntlmssp_authenticate: if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { rc = -EIO; - cERROR(1, "bad word count %d", smb_buf->WordCount); + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); goto ssetup_exit; } action = le16_to_cpu(pSMB->resp.Action); if (action & GUEST_LOGIN) - cFYI(1, "Guest login"); /* BB mark SesInfo struct? */ + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ - cFYI(1, "UID = %llu ", ses->Suid); + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); /* response can have either 3 or 4 word count - Samba sends 3 */ /* and lanman response is 3 */ bytes_remaining = get_bcc(smb_buf); @@ -908,7 +958,8 @@ ssetup_ntlmssp_authenticate: if (smb_buf->WordCount == 4) { blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); if (blob_len > bytes_remaining) { - cERROR(1, "bad security blob length %d", blob_len); + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); rc = -EINVAL; goto ssetup_exit; } @@ -933,8 +984,7 @@ ssetup_ntlmssp_authenticate: } decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); } else { - rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, - ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); } ssetup_exit: @@ -946,7 +996,7 @@ ssetup_exit: kfree(ntlmsspblob); ntlmsspblob = NULL; if (resp_buf_type == CIFS_SMALL_BUFFER) { - cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); + cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); cifs_small_buf_release(iov[0].iov_base); } else if (resp_buf_type == CIFS_LARGE_BUFFER) cifs_buf_release(iov[0].iov_base); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 47bc5a87f94e..e813f04511d8 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -61,10 +61,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, */ --server->sequence_number; rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + if (rc < 0) + server->sequence_number--; + mutex_unlock(&server->srv_mutex); - cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", - in_buf->Mid, rc); + cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", + in_buf->Mid, rc); return rc; } @@ -249,7 +252,7 @@ check2ndT2(char *buf) /* check for plausible wct, bcc and t2 data and parm sizes */ /* check for parm and data offset going beyond end of smb */ if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ - cFYI(1, "invalid transact2 word count"); + cifs_dbg(FYI, "invalid transact2 word count\n"); return -EINVAL; } @@ -261,18 +264,18 @@ check2ndT2(char *buf) if (total_data_size == data_in_this_rsp) return 0; else if (total_data_size < data_in_this_rsp) { - cFYI(1, "total data %d smaller than data in frame %d", - total_data_size, data_in_this_rsp); + cifs_dbg(FYI, "total data %d smaller than data in frame %d\n", + total_data_size, data_in_this_rsp); return -EINVAL; } remaining = total_data_size - data_in_this_rsp; - cFYI(1, "missing %d bytes from transact2, check next response", - remaining); + cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n", + remaining); if (total_data_size > CIFSMaxBufSize) { - cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, CIFSMaxBufSize); + cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n", + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -293,28 +296,28 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); if (tgt_total_cnt != src_total_cnt) - cFYI(1, "total data count of primary and secondary t2 differ " - "source=%hu target=%hu", src_total_cnt, tgt_total_cnt); + cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n", + src_total_cnt, tgt_total_cnt); total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); remaining = tgt_total_cnt - total_in_tgt; if (remaining < 0) { - cFYI(1, "Server sent too much data. tgt_total_cnt=%hu " - "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt); + cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n", + tgt_total_cnt, total_in_tgt); return -EPROTO; } if (remaining == 0) { /* nothing to do, ignore */ - cFYI(1, "no more data remains"); + cifs_dbg(FYI, "no more data remains\n"); return 0; } total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount); if (remaining < total_in_src) - cFYI(1, "transact2 2nd response contains too much data"); + cifs_dbg(FYI, "transact2 2nd response contains too much data\n"); /* find end of first SMB data area */ data_area_of_tgt = (char *)&pSMBt->hdr.Protocol + @@ -329,7 +332,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) total_in_tgt += total_in_src; /* is the result too big for the field? */ if (total_in_tgt > USHRT_MAX) { - cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt); + cifs_dbg(FYI, "coalesced DataCount too large (%u)\n", + total_in_tgt); return -EPROTO; } put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount); @@ -339,7 +343,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) byte_count += total_in_src; /* is the result too big for the field? */ if (byte_count > USHRT_MAX) { - cFYI(1, "coalesced BCC too large (%u)", byte_count); + cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count); return -EPROTO; } put_bcc(byte_count, target_hdr); @@ -348,7 +352,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) byte_count += total_in_src; /* don't allow buffer to overflow */ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count); + cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n", + byte_count); return -ENOBUFS; } target_hdr->smb_buf_length = cpu_to_be32(byte_count); @@ -358,12 +363,12 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) if (remaining != total_in_src) { /* more responses to go */ - cFYI(1, "waiting for more secondary responses"); + cifs_dbg(FYI, "waiting for more secondary responses\n"); return 1; } /* we are done */ - cFYI(1, "found the last secondary response"); + cifs_dbg(FYI, "found the last secondary response\n"); return 0; } @@ -388,7 +393,7 @@ cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, } if (!server->large_buf) { /*FIXME: switch to already allocated largebuf?*/ - cERROR(1, "1st trans2 resp needs bigbuf"); + cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n"); } else { /* Have first buffer */ mid->resp_buf = buf; @@ -444,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) * WRITEX header, not including the 4 byte RFC1001 length. */ if (!(server->capabilities & CAP_LARGE_WRITE_X) || - (!(server->capabilities & CAP_UNIX) && - (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + (!(server->capabilities & CAP_UNIX) && server->sign)) wsize = min_t(unsigned int, wsize, server->maxBuf - sizeof(WRITE_REQ) + 4); @@ -760,24 +764,17 @@ smb_set_file_info(struct inode *inode, const char *full_path, } tcon = tlink_tcon(tlink); - /* - * NT4 apparently returns success on this call, but it doesn't really - * work. - */ - if (!(tcon->ses->flags & CIFS_SES_NT4)) { - rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, - cifs_sb->local_nls, + rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - cinode->cifsAttrs = le32_to_cpu(buf->Attributes); - goto out; - } else if (rc != -EOPNOTSUPP && rc != -EINVAL) - goto out; + if (rc == 0) { + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) { + goto out; } - cFYI(1, "calling SetFileInfo since SetPathInfo for times not supported " - "by this server"); + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, @@ -960,4 +957,6 @@ struct smb_version_values smb1_values = { .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, .oplock_read = OPLOCK_READ, + .signing_enabled = SECMODE_SIGN_ENABLED, + .signing_required = SECMODE_SIGN_REQUIRED, }; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 71e6aed4b382..5da1b55a2258 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -43,13 +43,13 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { cinode->clientCanCacheAll = true; cinode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); } else if (oplock == SMB2_OPLOCK_LEVEL_II) { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = true; - cFYI(1, "Level II Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); } else { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = false; diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 7c0e2143e775..c38350851b08 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -54,5 +54,7 @@ #define SMB2_SIGNATURE_SIZE (16) #define SMB2_NTLMV2_SESSKEY_SIZE (16) #define SMB2_HMACSHA256_SIZE (32) +#define SMB2_CMACAES_SIZE (16) +#define SMB3_SIGNKEY_SIZE (16) #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 706482452df4..fff6dfba6204 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -92,7 +92,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, (FILE_BASIC_INFO *)data); break; default: - cERROR(1, "Invalid command"); + cifs_dbg(VFS, "Invalid command\n"); break; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 494c912c76fe..7c2f45c06fc2 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -2472,7 +2472,7 @@ map_smb2_to_linux_error(char *buf, bool log_err) /* on error mapping not found - return EIO */ - cFYI(1, "Mapping SMB2 status code %d to POSIX err %d", + cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n", smb2err, rc); return rc; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b1c5e3287fb..b0c43345cd98 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -45,17 +45,17 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) if (hdr->Command == SMB2_OPLOCK_BREAK) return 0; else - cERROR(1, "Received Request not response"); + cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) - cERROR(1, "Bad protocol string signature header %x", - *(unsigned int *) hdr->ProtocolId); + cifs_dbg(VFS, "Bad protocol string signature header %x\n", + *(unsigned int *) hdr->ProtocolId); if (mid != hdr->MessageId) - cERROR(1, "Mids do not match: %llu and %llu", mid, - hdr->MessageId); + cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", + mid, hdr->MessageId); } - cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId); + cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId); return 1; } @@ -101,7 +101,8 @@ smb2_check_message(char *buf, unsigned int length) int command; /* BB disable following printk later */ - cFYI(1, "%s length: 0x%x, smb_buf_length: 0x%x", __func__, length, len); + cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n", + __func__, length, len); /* * Add function to do table lookup of StructureSize by command @@ -117,12 +118,13 @@ smb2_check_message(char *buf, unsigned int length) */ return 0; } else { - cERROR(1, "Length less than SMB header size"); + cifs_dbg(VFS, "Length less than SMB header size\n"); } return 1; } if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) { - cERROR(1, "SMB length greater than maximum, mid=%llu", mid); + cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n", + mid); return 1; } @@ -130,14 +132,14 @@ smb2_check_message(char *buf, unsigned int length) return 1; if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { - cERROR(1, "Illegal structure size %u", - le16_to_cpu(hdr->StructureSize)); + cifs_dbg(VFS, "Illegal structure size %u\n", + le16_to_cpu(hdr->StructureSize)); return 1; } command = le16_to_cpu(hdr->Command); if (command >= NUMBER_OF_SMB2_COMMANDS) { - cERROR(1, "Illegal SMB2 command %d", command); + cifs_dbg(VFS, "Illegal SMB2 command %d\n", command); return 1; } @@ -145,30 +147,30 @@ smb2_check_message(char *buf, unsigned int length) if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { /* error packets have 9 byte structure size */ - cERROR(1, "Illegal response size %u for command %d", - le16_to_cpu(pdu->StructureSize2), command); + cifs_dbg(VFS, "Illegal response size %u for command %d\n", + le16_to_cpu(pdu->StructureSize2), command); return 1; } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0) && (le16_to_cpu(pdu->StructureSize2) != 44) && (le16_to_cpu(pdu->StructureSize2) != 36)) { /* special case for SMB2.1 lease break message */ - cERROR(1, "Illegal response size %d for oplock break", - le16_to_cpu(pdu->StructureSize2)); + cifs_dbg(VFS, "Illegal response size %d for oplock break\n", + le16_to_cpu(pdu->StructureSize2)); return 1; } } if (4 + len != length) { - cERROR(1, "Total length %u RFC1002 length %u mismatch mid %llu", - length, 4 + len, mid); + cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n", + length, 4 + len, mid); return 1; } clc_len = smb2_calc_size(hdr); if (4 + len != clc_len) { - cFYI(1, "Calculated size %u length %u mismatch mid %llu", - clc_len, 4 + len, mid); + cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", + clc_len, 4 + len, mid); /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; @@ -264,10 +266,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); break; case SMB2_IOCTL: + *off = le32_to_cpu( + ((struct smb2_ioctl_rsp *)hdr)->OutputOffset); + *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount); + break; case SMB2_CHANGE_NOTIFY: default: /* BB FIXME for unimplemented cases above */ - cERROR(1, "no length check for command"); + cifs_dbg(VFS, "no length check for command\n"); break; } @@ -276,20 +282,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) * we have little choice but to ignore the data area in this case. */ if (*off > 4096) { - cERROR(1, "offset %d too large, data area ignored", *off); + cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off); *len = 0; *off = 0; } else if (*off < 0) { - cERROR(1, "negative offset %d to data invalid ignore data area", - *off); + cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n", + *off); *off = 0; *len = 0; } else if (*len < 0) { - cERROR(1, "negative data length %d invalid, data area ignored", - *len); + cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n", + *len); *len = 0; } else if (*len > 128 * 1024) { - cERROR(1, "data area larger than 128K: %d", *len); + cifs_dbg(VFS, "data area larger than 128K: %d\n", *len); *len = 0; } @@ -324,7 +330,7 @@ smb2_calc_size(void *buf) goto calc_size_exit; smb2_get_data_area_len(&offset, &data_length, hdr); - cFYI(1, "SMB2 data length %d offset %d", data_length, offset); + cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset); if (data_length > 0) { /* @@ -335,15 +341,15 @@ smb2_calc_size(void *buf) * the size of the RFC1001 hdr. */ if (offset + 4 + 1 < len) { - cERROR(1, "data area offset %d overlaps SMB2 header %d", - offset + 4 + 1, len); + cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n", + offset + 4 + 1, len); data_length = 0; } else { len = 4 + offset + data_length; } } calc_size_exit: - cFYI(1, "SMB2 len %d", len); + cifs_dbg(FYI, "SMB2 len %d\n", len); return len; } @@ -405,7 +411,7 @@ cifs_ses_oplock_break(struct work_struct *work) rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, lw->lease_state); - cFYI(1, "Lease release rc %d", rc); + cifs_dbg(FYI, "Lease release rc %d\n", rc); cifs_put_tlink(lw->tlink); kfree(lw); } @@ -426,15 +432,13 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); - if (!lw) { - cERROR(1, "Memory allocation failed during lease break check"); + if (!lw) return false; - } INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); lw->lease_state = rsp->NewLeaseState; - cFYI(1, "Checking for lease break"); + cifs_dbg(FYI, "Checking for lease break\n"); /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -455,9 +459,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) SMB2_LEASE_KEY_SIZE)) continue; - cFYI(1, "found in the open list"); - cFYI(1, "lease key match, lease break 0x%d", - le32_to_cpu(rsp->NewLeaseState)); + cifs_dbg(FYI, "found in the open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); smb2_set_oplock_level(cinode, smb2_map_lease_to_oplock(rsp->NewLeaseState)); @@ -489,9 +493,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) &lw->lease_break); } - cFYI(1, "found in the pending open list"); - cFYI(1, "lease key match, lease break 0x%d", - le32_to_cpu(rsp->NewLeaseState)); + cifs_dbg(FYI, "found in the pending open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); open->oplock = smb2_map_lease_to_oplock(rsp->NewLeaseState); @@ -506,7 +510,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) } spin_unlock(&cifs_tcp_ses_lock); kfree(lw); - cFYI(1, "Can not process lease break - no lease matched"); + cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); return false; } @@ -520,7 +524,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) struct cifsInodeInfo *cinode; struct cifsFileInfo *cfile; - cFYI(1, "Checking for oplock break"); + cifs_dbg(FYI, "Checking for oplock break\n"); if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) return false; @@ -533,7 +537,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) return false; } - cFYI(1, "oplock level 0x%d", rsp->OplockLevel); + cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -553,7 +557,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cfile->fid.volatile_fid) continue; - cFYI(1, "file id match, oplock break"); + cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(cfile->dentry->d_inode); if (!cinode->clientCanCacheAll && @@ -573,11 +577,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) } spin_unlock(&cifs_file_list_lock); spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "No matching file for oplock break"); + cifs_dbg(FYI, "No matching file for oplock break\n"); return true; } } spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "Can not process oplock break for non-existent connection"); + cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); return false; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bceffe7b8f8d..6d15cab95b99 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -38,13 +38,13 @@ change_conf(struct TCP_Server_Info *server) case 1: server->echoes = false; server->oplocks = false; - cERROR(1, "disabling echoes and oplocks"); + cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cFYI(1, "disabling oplocks"); + cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -147,10 +147,10 @@ smb2_dump_detail(void *buf) #ifdef CONFIG_CIFS_DEBUG2 struct smb2_hdr *smb = (struct smb2_hdr *)buf; - cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d", - smb->Command, smb->Status, smb->Flags, smb->MessageId, - smb->ProcessId); - cERROR(1, "smb buf %p len %u", smb, smb2_calc_size(smb)); + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", + smb->Command, smb->Status, smb->Flags, smb->MessageId, + smb->ProcessId); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb)); #endif } @@ -281,6 +281,25 @@ smb2_clear_stats(struct cifs_tcon *tcon) } static void +smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) +{ + seq_puts(m, "\n\tShare Capabilities:"); + if (tcon->capabilities & SMB2_SHARE_CAP_DFS) + seq_puts(m, " DFS,"); + if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY) + seq_puts(m, " CONTINUOUS AVAILABILITY,"); + if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT) + seq_puts(m, " SCALEOUT,"); + if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) + seq_puts(m, " CLUSTER,"); + if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC) + seq_puts(m, " ASYMMETRIC,"); + if (tcon->capabilities == 0) + seq_puts(m, " None"); + seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); +} + +static void smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) { #ifdef CONFIG_CIFS_STATS @@ -292,7 +311,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, "\nSessionSetups: %d sent %d failed", atomic_read(&sent[SMB2_SESSION_SETUP_HE]), atomic_read(&failed[SMB2_SESSION_SETUP_HE])); -#define SMB2LOGOFF 0x0002 /* trivial request/resp */ seq_printf(m, "\nLogoffs: %d sent %d failed", atomic_read(&sent[SMB2_LOGOFF_HE]), atomic_read(&failed[SMB2_LOGOFF_HE])); @@ -436,7 +454,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, &oplock, NULL); kfree(utf16_path); if (rc) { - cERROR(1, "open dir failed"); + cifs_dbg(VFS, "open dir failed\n"); return rc; } @@ -448,7 +466,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0, srch_inf); if (rc) { - cERROR(1, "query directory failed"); + cifs_dbg(VFS, "query directory failed\n"); SMB2_close(xid, tcon, persistent_fid, volatile_fid); } return rc; @@ -645,6 +663,7 @@ struct smb_version_operations smb30_operations = { .dump_detail = smb2_dump_detail, .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -690,6 +709,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, }; @@ -709,6 +729,8 @@ struct smb_version_values smb20_values = { .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, }; struct smb_version_values smb21_values = { @@ -727,6 +749,8 @@ struct smb_version_values smb21_values = { .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, }; struct smb_version_values smb30_values = { @@ -745,4 +769,26 @@ struct smb_version_values smb30_values = { .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, +}; + +struct smb_version_values smb302_values = { + .version_string = SMB302_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, }; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 41d9d0725f0f..2b312e4eeaa6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1,7 +1,7 @@ /* * fs/cifs/smb2pdu.c * - * Copyright (C) International Business Machines Corp., 2009, 2012 + * Copyright (C) International Business Machines Corp., 2009, 2013 * Etersoft, 2012 * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 @@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , if (!tcon) goto out; + /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */ + /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ + /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ + if ((tcon->ses) && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + hdr->CreditCharge = cpu_to_le16(1); + /* else CreditCharge MBZ */ + hdr->TreeId = tcon->tid; /* Uid is not converted */ if (tcon->ses) hdr->SessionId = tcon->ses->Suid; - /* BB check following DFS flags BB */ - /* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */ - if (tcon->share_flags & SHI1005_FLAGS_DFS) - hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; - /* BB how does SMB2 do case sensitive? */ - /* if (tcon->nocase) - hdr->Flags |= SMBFLG_CASELESS; */ - if (tcon->ses && tcon->ses->server && - (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED)) + + /* + * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have + * to pass the path on the Open SMB prefixed by \\server\share. + * Not sure when we would need to do the augmented path (if ever) and + * setting this flag breaks the SMB2 open operation since it is + * illegal to send an empty path name (without \\server\share prefix) + * when the DFS flag is set in the SMB open header. We could + * consider setting the flag on all operations other than open + * but it is safer to net set it for now. + */ +/* if (tcon->share_flags & SHI1005_FLAGS_DFS) + hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ + + if (tcon->ses && tcon->ses->server && tcon->ses->server->sign) hdr->Flags |= SMB2_FLAGS_SIGNED; out: pdu->StructureSize2 = cpu_to_le16(parmsize); @@ -155,8 +169,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if ((smb2_command != SMB2_WRITE) && (smb2_command != SMB2_CREATE) && (smb2_command != SMB2_TREE_DISCONNECT)) { - cFYI(1, "can not send cmd %d while umounting", - smb2_command); + cifs_dbg(FYI, "can not send cmd %d while umounting\n", + smb2_command); return -ENODEV; } } @@ -200,7 +214,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) * back on-line */ if (!tcon->retry) { - cFYI(1, "gave up waiting on reconnect in smb_init"); + cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); return -EHOSTDOWN; } } @@ -227,7 +241,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) cifs_mark_open_files_invalid(tcon); rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); - cFYI(1, "reconnect tcon rc = %d", rc); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; atomic_inc(&tconInfoReconnectCount); @@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct kvec iov[1]; int rc = 0; int resp_buftype; - struct TCP_Server_Info *server; - unsigned int sec_flags; - u16 temp = 0; + struct TCP_Server_Info *server = ses->server; int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; - cFYI(1, "Negotiate protocol"); + cifs_dbg(FYI, "Negotiate protocol\n"); - if (ses->server) - server = ses->server; - else { - rc = -EIO; - return rc; + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; } rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req); if (rc) return rc; - /* if any of auth flags (ie not sign or seal) are overriden use them */ - if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/ - else /* if override flags set only sign/seal OR them with global auth */ - sec_flags = global_secflags | ses->overrideSecFlg; - - cFYI(1, "sec_flags 0x%x", sec_flags); - req->hdr.SessionId = 0; req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); @@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) inc_rfc1001_len(req, 2); /* only one of SMB2 signing flags may be set in SMB2 request */ - if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) - temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; - else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */ - temp = SMB2_NEGOTIATE_SIGNING_ENABLED; - - req->SecurityMode = cpu_to_le16(temp); + if (ses->sign) + req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); + else if (global_secflags & CIFSSEC_MAY_SIGN) + req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); + else + req->SecurityMode = 0; req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); @@ -389,24 +391,28 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc != 0) goto neg_exit; - cFYI(1, "mode 0x%x", rsp->SecurityMode); + cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); /* BB we may eventually want to match the negotiated vs. requested dialect, even though we are only requesting one at a time */ if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) - cFYI(1, "negotiated smb2.0 dialect"); + cifs_dbg(FYI, "negotiated smb2.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) - cFYI(1, "negotiated smb2.1 dialect"); + cifs_dbg(FYI, "negotiated smb2.1 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID)) - cFYI(1, "negotiated smb3.0 dialect"); + cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); + else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); else { - cERROR(1, "Illegal dialect returned by server %d", - le16_to_cpu(rsp->DialectRevision)); + cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; } server->dialect = le16_to_cpu(rsp->DialectRevision); + /* SMB2 only has an extended negflavor */ + server->negflavor = CIFS_NEGFLAVOR_EXTENDED; server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); server->max_read = le32_to_cpu(rsp->MaxReadSize); server->max_write = le32_to_cpu(rsp->MaxWriteSize); @@ -418,45 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, &rsp->hdr); - if (blob_length == 0) { - cERROR(1, "missing security blob on negprot"); - rc = -EIO; - goto neg_exit; - } - - cFYI(1, "sec_flags 0x%x", sec_flags); - if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { - cFYI(1, "Signing required"); - if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | - SMB2_NEGOTIATE_SIGNING_ENABLED))) { - cERROR(1, "signing required but server lacks support"); - rc = -EOPNOTSUPP; - goto neg_exit; - } - server->sec_mode |= SECMODE_SIGN_REQUIRED; - } else if (sec_flags & CIFSSEC_MAY_SIGN) { - cFYI(1, "Signing optional"); - if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { - cFYI(1, "Server requires signing"); - server->sec_mode |= SECMODE_SIGN_REQUIRED; - } else { - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } - } else { - cFYI(1, "Signing disabled"); - if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { - cERROR(1, "Server requires packet signing to be enabled" - " in /proc/fs/cifs/SecurityFlags."); - rc = -EOPNOTSUPP; - goto neg_exit; - } - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } + /* + * See MS-SMB2 section 2.2.4: if no blob, client picks default which + * for us will be + * ses->sectype = RawNTLMSSP; + * but for time being this is our only auth choice so doesn't matter. + * We just found a server which sets blob length to zero expecting raw. + */ + if (blob_length == 0) + cifs_dbg(FYI, "missing security blob on negprot\n"); + rc = cifs_enable_signing(server, ses->sign); #ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ - rc = decode_neg_token_init(security_blob, blob_length, + if (rc) + goto neg_exit; + if (blob_length) + rc = decode_neg_token_init(security_blob, blob_length, &server->sec_type); if (rc == 1) rc = 0; @@ -481,21 +464,17 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, int rc = 0; int resp_buftype; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - struct TCP_Server_Info *server; - unsigned int sec_flags; - u8 temp = 0; + struct TCP_Server_Info *server = ses->server; u16 blob_length = 0; char *security_blob; char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ - cFYI(1, "Session Setup"); + cifs_dbg(FYI, "Session Setup\n"); - if (ses->server) - server = ses->server; - else { - rc = -EIO; - return rc; + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; } /* @@ -506,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, if (!ses->ntlmssp) return -ENOMEM; - ses->server->secType = RawNTLMSSP; + /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ + ses->sectype = RawNTLMSSP; ssetup_ntlmssp_authenticate: if (phase == NtLmChallenge) @@ -516,28 +496,19 @@ ssetup_ntlmssp_authenticate: if (rc) return rc; - /* if any of auth flags (ie not sign or seal) are overriden use them */ - if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/ - else /* if override flags set only sign/seal OR them with global auth */ - sec_flags = global_secflags | ses->overrideSecFlg; - - cFYI(1, "sec_flags 0x%x", sec_flags); - req->hdr.SessionId = 0; /* First session, not a reauthenticate */ req->VcNumber = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); /* only one of SMB2 signing flags may be set in SMB2 request */ - if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) - temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; - else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) - temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; - else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */ - temp = SMB2_NEGOTIATE_SIGNING_ENABLED; - - req->SecurityMode = temp; + if (server->sign) + req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED; + else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */ + req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED; + else + req->SecurityMode = 0; + req->Capabilities = 0; req->Channel = 0; /* MBZ */ @@ -558,7 +529,7 @@ ssetup_ntlmssp_authenticate: sizeof(struct _NEGOTIATE_MESSAGE), ntlmssp_blob); */ /* BB eventually need to add this */ - cERROR(1, "spnego not supported for SMB2 yet"); + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); rc = -EOPNOTSUPP; kfree(ntlmssp_blob); goto ssetup_exit; @@ -572,14 +543,14 @@ ssetup_ntlmssp_authenticate: ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, GFP_KERNEL); if (ntlmssp_blob == NULL) { - cERROR(1, "failed to malloc ntlmssp blob"); rc = -ENOMEM; goto ssetup_exit; } rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses, nls_cp); if (rc) { - cFYI(1, "build_ntlmssp_auth_blob failed %d", rc); + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", + rc); goto ssetup_exit; /* BB double check error handling */ } if (use_spnego) { @@ -587,7 +558,7 @@ ssetup_ntlmssp_authenticate: &security_blob, blob_length, ntlmssp_blob); */ - cERROR(1, "spnego not supported for SMB2 yet"); + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); rc = -EOPNOTSUPP; kfree(ntlmssp_blob); goto ssetup_exit; @@ -595,7 +566,7 @@ ssetup_ntlmssp_authenticate: security_blob = ntlmssp_blob; } } else { - cERROR(1, "illegal ntlmssp phase"); + cifs_dbg(VFS, "illegal ntlmssp phase\n"); rc = -EIO; goto ssetup_exit; } @@ -620,13 +591,13 @@ ssetup_ntlmssp_authenticate: if (resp_buftype != CIFS_NO_BUFFER && rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { if (phase != NtLmNegotiate) { - cERROR(1, "Unexpected more processing error"); + cifs_dbg(VFS, "Unexpected more processing error\n"); goto ssetup_exit; } if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != le16_to_cpu(rsp->SecurityBufferOffset)) { - cERROR(1, "Invalid security buffer offset %d", - le16_to_cpu(rsp->SecurityBufferOffset)); + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); rc = -EIO; goto ssetup_exit; } @@ -667,7 +638,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) int rc = 0; struct TCP_Server_Info *server; - cFYI(1, "disconnect session %p", ses); + cifs_dbg(FYI, "disconnect session %p\n", ses); if (ses && (ses->server)) server = ses->server; @@ -680,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) /* since no tcon, smb2_init can not do this, so do here */ req->hdr.SessionId = ses->Suid; - if (server->sec_mode & SECMODE_SIGN_REQUIRED) + if (server->sign) req->hdr.Flags |= SMB2_FLAGS_SIGNED; rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); @@ -711,7 +682,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct TCP_Server_Info *server; __le16 *unc_path = NULL; - cFYI(1, "TCON"); + cifs_dbg(FYI, "TCON\n"); if ((ses->server) && tree) server = ses->server; @@ -775,29 +746,30 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } if (rsp->ShareType & SMB2_SHARE_TYPE_DISK) - cFYI(1, "connection to disk share"); + cifs_dbg(FYI, "connection to disk share\n"); else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) { tcon->ipc = true; - cFYI(1, "connection to pipe share"); + cifs_dbg(FYI, "connection to pipe share\n"); } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) { tcon->print = true; - cFYI(1, "connection to printer"); + cifs_dbg(FYI, "connection to printer\n"); } else { - cERROR(1, "unknown share type %d", rsp->ShareType); + cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); rc = -EOPNOTSUPP; goto tcon_error_exit; } tcon->share_flags = le32_to_cpu(rsp->ShareFlags); + tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; tcon->tid = rsp->hdr.TreeId; - strncpy(tcon->treeName, tree, MAX_TREE_SIZE); + strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) - cERROR(1, "DFS capability contradicts DFS flag"); + cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); tcon_exit: free_rsp_buf(resp_buftype, rsp); @@ -806,7 +778,7 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { - cERROR(1, "BAD_NETWORK_NAME: %s", tree); + cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); tcon->bad_network_name = true; } goto tcon_exit; @@ -820,7 +792,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; - cFYI(1, "Tree Disconnect"); + cifs_dbg(FYI, "Tree Disconnect\n"); if (ses && (ses->server)) server = ses->server; @@ -846,12 +818,10 @@ create_lease_buf(u8 *lease_key, u8 oplock) { struct create_lease *buf; - buf = kmalloc(sizeof(struct create_lease), GFP_KERNEL); + buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); if (!buf) return NULL; - memset(buf, 0, sizeof(struct create_lease)); - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) @@ -925,7 +895,7 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, int rc = 0; int num_iovecs = 2; - cFYI(1, "create/open"); + cifs_dbg(FYI, "create/open\n"); if (ses && (ses->server)) server = ses->server; @@ -1039,6 +1009,122 @@ creat_exit: return rc; } +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, + u32 indatalen, char **out_data, u32 *plen /* returned data len */) +{ + struct smb2_ioctl_req *req; + struct smb2_ioctl_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[2]; + int resp_buftype; + int num_iovecs; + int rc = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req); + if (rc) + return rc; + + req->CtlCode = cpu_to_le32(opcode); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + if (indatalen) { + req->InputCount = cpu_to_le32(indatalen); + /* do not set InputOffset if no input data */ + req->InputOffset = + cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4); + iov[1].iov_base = in_data; + iov[1].iov_len = indatalen; + num_iovecs = 2; + } else + num_iovecs = 1; + + req->OutputOffset = 0; + req->OutputCount = 0; /* MBZ */ + + /* + * Could increase MaxOutputResponse, but that would require more + * than one credit. Windows typically sets this smaller, but for some + * ioctls it may be useful to allow server to send more. No point + * limiting what the server can send as long as fits in one credit + */ + req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + + if (is_fsctl) + req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); + else + req->Flags = 0; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + if (indatalen) + inc_rfc1001_len(req, indatalen); + + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); + rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; + + if (rc != 0) { + if (tcon) + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + goto ioctl_exit; + } + + /* check if caller wants to look at return data or just return rc */ + if ((plen == NULL) || (out_data == NULL)) + goto ioctl_exit; + + *plen = le32_to_cpu(rsp->OutputCount); + + /* We check for obvious errors in the output buffer length and offset */ + if (*plen == 0) + goto ioctl_exit; /* server returned no data */ + else if (*plen > 0xFF00) { + cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen); + *plen = 0; + rc = -EIO; + goto ioctl_exit; + } + + if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) { + cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, + le32_to_cpu(rsp->OutputOffset)); + *plen = 0; + rc = -EIO; + goto ioctl_exit; + } + + *out_data = kmalloc(*plen, GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + goto ioctl_exit; + } + + memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + *plen); +ioctl_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) @@ -1051,7 +1137,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype; int rc = 0; - cFYI(1, "Close"); + cifs_dbg(FYI, "Close\n"); if (ses && (ses->server)) server = ses->server; @@ -1097,20 +1183,20 @@ validate_buf(unsigned int offset, unsigned int buffer_length, if (buffer_length < min_buf_size) { - cERROR(1, "buffer length %d smaller than minimum size %d", - buffer_length, min_buf_size); + cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n", + buffer_length, min_buf_size); return -EINVAL; } /* check if beyond RFC1001 maximum length */ if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) { - cERROR(1, "buffer length %d or smb length %d too large", - buffer_length, smb_len); + cifs_dbg(VFS, "buffer length %d or smb length %d too large\n", + buffer_length, smb_len); return -EINVAL; } if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) { - cERROR(1, "illegal server response, bad offset to data"); + cifs_dbg(VFS, "illegal server response, bad offset to data\n"); return -EINVAL; } @@ -1155,7 +1241,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; - cFYI(1, "Query Info"); + cifs_dbg(FYI, "Query Info\n"); if (ses && (ses->server)) server = ses->server; @@ -1247,7 +1333,7 @@ SMB2_echo(struct TCP_Server_Info *server) struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; - cFYI(1, "In echo request"); + cifs_dbg(FYI, "In echo request\n"); rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) @@ -1262,7 +1348,7 @@ SMB2_echo(struct TCP_Server_Info *server) rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server, CIFS_ECHO_OP); if (rc) - cFYI(1, "Echo request failed: %d", rc); + cifs_dbg(FYI, "Echo request failed: %d\n", rc); cifs_small_buf_release(req); return rc; @@ -1279,7 +1365,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype; int rc = 0; - cFYI(1, "Flush"); + cifs_dbg(FYI, "Flush\n"); if (ses && (ses->server)) server = ses->server; @@ -1379,21 +1465,21 @@ smb2_readv_callback(struct mid_q_entry *mid) .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; - cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, - mid->mid, mid->mid_state, rdata->result, rdata->bytes); + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", + __func__, mid->mid, mid->mid_state, rdata->result, + rdata->bytes); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: credits_received = le16_to_cpu(buf->CreditRequest); /* result already set, check signature */ - if (server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (server->sign) { int rc; rc = smb2_verify_signature(&rqst, server); if (rc) - cERROR(1, "SMB signature verification returned " - "error = %d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } /* FIXME: should this be counted toward the initiating task? */ task_io_account_read(rdata->bytes); @@ -1426,8 +1512,8 @@ smb2_async_readv(struct cifs_readdata *rdata) struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; - cFYI(1, "%s: offset=%llu bytes=%u", __func__, - rdata->offset, rdata->bytes); + cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", + __func__, rdata->offset, rdata->bytes); io_parms.tcon = tlink_tcon(rdata->cfile->tlink); io_parms.offset = rdata->offset; @@ -1481,13 +1567,13 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cERROR(1, "Send error in read = %d", rc); + cifs_dbg(VFS, "Send error in read = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { - cFYI(1, "bad length %d for count %d", *nbytes, - io_parms->length); + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); rc = -EIO; *nbytes = 0; } @@ -1597,7 +1683,8 @@ smb2_async_writev(struct cifs_writedata *wdata) rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; - cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes\n", + wdata->offset, wdata->bytes); req->Length = cpu_to_le32(wdata->bytes); @@ -1670,7 +1757,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) { cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); - cERROR(1, "Send error in write = %d", rc); + cifs_dbg(VFS, "Send error in write = %d\n", rc); } else *nbytes = le32_to_cpu(rsp->DataLength); @@ -1696,14 +1783,14 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) ((char *)entryptr + next_offset); if ((char *)entryptr + size > end_of_buf) { - cERROR(1, "malformed search entry would overflow"); + cifs_dbg(VFS, "malformed search entry would overflow\n"); break; } len = le32_to_cpu(entryptr->FileNameLength); if ((char *)entryptr + len + size > end_of_buf) { - cERROR(1, "directory entry name would overflow frame " - "end of buf %p", end_of_buf); + cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n", + end_of_buf); break; } @@ -1759,8 +1846,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; break; default: - cERROR(1, "info level %u isn't supported", - srch_inf->info_level); + cifs_dbg(VFS, "info level %u isn't supported\n", + srch_inf->info_level); rc = -EINVAL; goto qdir_exit; } @@ -1824,15 +1911,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, num_entries(srch_inf->srch_entries_start, end_of_smb, &srch_inf->last_entry, info_buf_size); srch_inf->index_of_last_entry += srch_inf->entries_in_buffer; - cFYI(1, "num entries %d last_index %lld srch start %p srch end %p", - srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, - srch_inf->srch_entries_start, srch_inf->last_entry); + cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n", + srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, + srch_inf->srch_entries_start, srch_inf->last_entry); if (resp_buftype == CIFS_LARGE_BUFFER) srch_inf->smallBuf = false; else if (resp_buftype == CIFS_SMALL_BUFFER) srch_inf->smallBuf = true; else - cERROR(1, "illegal search buffer type"); + cifs_dbg(VFS, "illegal search buffer type\n"); if (rsp->hdr.Status == STATUS_NO_MORE_FILES) srch_inf->endOfSearch = 1; @@ -2017,7 +2104,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_oplock_break *req = NULL; - cFYI(1, "SMB2_oplock_break"); + cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); if (rc) @@ -2033,7 +2120,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); - cFYI(1, "Send error in Oplock Break = %d", rc); + cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); } return rc; @@ -2058,7 +2145,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, int rc; struct smb2_query_info_req *req; - cFYI(1, "Query FSInfo level %d", level); + cifs_dbg(FYI, "Query FSInfo level %d\n", level); if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; @@ -2131,7 +2218,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; unsigned int count; - cFYI(1, "smb2_lockv num lock %d", num_lock); + cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req); if (rc) @@ -2155,7 +2242,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); if (rc) { - cFYI(1, "Send error in smb2_lockv = %d", rc); + cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); } @@ -2186,7 +2273,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_lease_ack *req = NULL; - cFYI(1, "SMB2_lease_break"); + cifs_dbg(FYI, "SMB2_lease_break\n"); rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); if (rc) @@ -2204,7 +2291,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); - cFYI(1, "Send error in Lease Break = %d", rc); + cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); } return rc; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4cb4ced258cb..f31043b26bd3 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1,7 +1,7 @@ /* * fs/cifs/smb2pdu.h * - * Copyright (c) International Business Machines Corp., 2009, 2010 + * Copyright (c) International Business Machines Corp., 2009, 2013 * Etersoft, 2012 * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 @@ -170,6 +170,7 @@ struct smb2_negotiate_req { #define SMB20_PROT_ID 0x0202 #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 #define BAD_PROT_ID 0xFFFF /* SecurityMode flags */ @@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp { #define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 #define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 #define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 +#define SHI1005_FLAGS_ALL 0x0000FF33 /* Possible share capabilities */ -#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ +#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ +#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ +#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ +#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ struct smb2_tree_disconnect_req { struct smb2_hdr hdr; @@ -477,6 +485,75 @@ struct create_lease { struct lease_context lcontext; } __packed; +/* this goes in the ioctl buffer when doing a copychunk request */ +struct copychunk_ioctl { + char SourceKey[24]; + __le32 ChunkCount; /* we are only sending 1 */ + __le32 Reserved; + /* array will only be one chunk long for us */ + __le64 SourceOffset; + __le64 TargetOffset; + __le32 Length; /* how many bytes to copy */ + __u32 Reserved2; +} __packed; + +/* Response and Request are the same format */ +struct validate_negotiate_info { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 DialectCount; + __le16 Dialect[1]; +} __packed; + +#define RSS_CAPABLE 0x00000001 +#define RDMA_CAPABLE 0x00000002 + +struct network_interface_info_ioctl_rsp { + __le32 Next; /* next interface. zero if this is last one */ + __le32 IfIndex; + __le32 Capability; /* RSS or RDMA Capable */ + __le32 Reserved; + __le64 LinkSpeed; + char SockAddr_Storage[128]; +} __packed; + +#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ + +struct smb2_ioctl_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 InputOffset; + __le32 InputCount; + __le32 MaxInputResponse; + __le32 OutputOffset; + __le32 OutputCount; + __le32 MaxOutputResponse; + __le32 Flags; + __u32 Reserved2; + char Buffer[0]; +} __packed; + +struct smb2_ioctl_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 InputOffset; + __le32 InputCount; + __le32 OutputOffset; + __le32 OutputCount; + __le32 Flags; + __u32 Reserved2; + /* char * buffer[] */ +} __packed; + /* Currently defined values for close flags */ #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) struct smb2_close_req { @@ -517,17 +594,25 @@ struct smb2_flush_rsp { __le16 Reserved; } __packed; +/* For read request Flags field below, following flag is defined for SMB3.02 */ +#define SMB2_READFLAG_READ_UNBUFFERED 0x01 + +/* Channel field for read and write: exactly one of following flags can be set*/ +#define SMB2_CHANNEL_NONE 0x00000000 +#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */ + struct smb2_read_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 49 */ __u8 Padding; /* offset from start of SMB2 header to place read */ - __u8 Reserved; + __u8 Flags; /* MBZ unless SMB3.02 or later */ __le32 Length; __le64 Offset; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ __le32 MinimumCount; - __le32 Channel; /* Reserved MBZ */ + __le32 Channel; /* MBZ except for SMB3 or later */ __le32 RemainingBytes; __le16 ReadChannelInfoOffset; /* Reserved MBZ */ __le16 ReadChannelInfoLength; /* Reserved MBZ */ @@ -545,8 +630,9 @@ struct smb2_read_rsp { __u8 Buffer[1]; } __packed; -/* For write request Flags field below the following flag is defined: */ -#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 +/* For write request Flags field below the following flags are defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ +#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ struct smb2_write_req { struct smb2_hdr hdr; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 2aa3535e38ce..d4e1eb807457 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -111,6 +111,10 @@ extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __u32 desired_access, __u32 create_disposition, __u32 file_attributes, __u32 create_options, __u8 *oplock, struct smb2_file_all_info *buf); +extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 8dd73e61d762..09b4fbaadeb6 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -55,13 +55,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_setkey(server->secmech.hmacsha256, server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { - cERROR(1, "%s: Could not update with response\n", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); if (rc) { - cERROR(1, "%s: Could not init md5\n", __func__); + cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } @@ -69,7 +69,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { - cERROR(1, "null iovec entry"); + cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } /* @@ -90,8 +90,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) iov[i].iov_base, iov[i].iov_len); } if (rc) { - cERROR(1, "%s: Could not update with payload\n", - __func__); + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); return rc; } } @@ -109,18 +109,162 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, sigptr); if (rc) - cERROR(1, "%s: Could not generate sha256 hash\n", __func__); + cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); return rc; } +void +generate_smb3signingkey(struct TCP_Server_Info *server) +{ + unsigned char zero = 0x0; + __u8 i[4] = {0, 0, 0, 1}; + __u8 L[4] = {0, 0, 0, 128}; + int rc = 0; + unsigned char prfhash[SMB2_HMACSHA256_SIZE]; + unsigned char *hashptr = prfhash; + + memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); + memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + + rc = crypto_shash_setkey(server->secmech.hmacsha256, + server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set with session key\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + i, 4); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with n\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + "SMB2AESCMAC", 12); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with label\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + &zero, 1); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with zero\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + "SmbSign", 8); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with context\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + L, 4); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with L\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, + hashptr); + if (rc) { + cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); + goto smb3signkey_ret; + } + + memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + +smb3signkey_ret: + return; +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - cFYI(1, "smb3 signatures not supported yet"); - return -EOPNOTSUPP; + int i, rc; + unsigned char smb3_signature[SMB2_CMACAES_SIZE]; + unsigned char *sigptr = smb3_signature; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + + memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); + memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + + rc = crypto_shash_setkey(server->secmech.cmacaes, + server->smb3signingkey, SMB2_CMACAES_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); + return rc; + } + + rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); + return rc; + } + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cifs_dbg(VFS, "null iovec entry"); + return -EIO; + } + /* + * The first entry includes a length field (which does not get + * signed that occupies the first 4 bytes before the header). + */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + rc = + crypto_shash_update( + &server->secmech.sdesccmacaes->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else { + rc = + crypto_shash_update( + &server->secmech.sdesccmacaes->shash, + iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n", + __func__); + return rc; + } + } + + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdesccmacaes->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + + rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash, + sigptr); + if (rc) + cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__); + + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + + return rc; } /* must be called with server->srv_mutex held */ @@ -163,8 +307,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) /* Do not need to verify session setups with signature "BSRSPYL " */ if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0) - cFYI(1, "dummy signature received for smb command 0x%x", - smb2_pdu->Command); + cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", + smb2_pdu->Command); /* * Save off the origiginal signature so we can modify the smb and check @@ -205,7 +349,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, struct mid_q_entry *temp; if (server == NULL) { - cERROR(1, "Null TCP session in smb2_mid_entry_alloc"); + cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); return NULL; } @@ -241,7 +385,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf, return -ENOENT; if (ses->server->tcpStatus == CifsNeedReconnect) { - cFYI(1, "tcp session dead - return to caller to retry"); + cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } @@ -275,14 +419,13 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, dump_smb(mid->resp_buf, min_t(u32, 80, len)); /* convert the length into a more usable form */ - if ((len > 24) && - (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) { + if (len > 24 && server->sign) { int rc; rc = smb2_verify_signature(&rqst, server); if (rc) - cERROR(1, "SMB signature verification returned error = " - "%d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } return map_smb2_to_linux_error(mid->resp_buf, log_error); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index a0a58fbe2c10..43eb1367b103 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -78,7 +78,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_des)) { rc = PTR_ERR(tfm_des); - cERROR(1, "could not allocate des crypto API"); + cifs_dbg(VFS, "could not allocate des crypto API\n"); goto smbhash_err; } @@ -91,7 +91,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); if (rc) - cERROR(1, "could not encrypt crypt key rc: %d", rc); + cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); crypto_free_blkcipher(tfm_des); smbhash_err: @@ -139,14 +139,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) md4 = crypto_alloc_shash("md4", 0, 0); if (IS_ERR(md4)) { rc = PTR_ERR(md4); - cERROR(1, "%s: Crypto md4 allocation error %d", __func__, rc); + cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n", + __func__, rc); return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); sdescmd4 = kmalloc(size, GFP_KERNEL); if (!sdescmd4) { rc = -ENOMEM; - cERROR(1, "%s: Memory allocation failure", __func__); goto mdfour_err; } sdescmd4->shash.tfm = md4; @@ -154,17 +154,17 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) rc = crypto_shash_init(&sdescmd4->shash); if (rc) { - cERROR(1, "%s: Could not init md4 shash", __func__); + cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__); goto mdfour_err; } rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); if (rc) { - cERROR(1, "%s: Could not update with link_str", __func__); + cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto mdfour_err; } rc = crypto_shash_final(&sdescmd4->shash, md4_hash); if (rc) - cERROR(1, "%s: Could not genereate md4 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__); mdfour_err: crypto_free_shash(md4); @@ -238,7 +238,8 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, rc = E_md4hash(passwd, p16, codepage); if (rc) { - cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", + __func__, rc); return rc; } memcpy(p21, p16, 16); diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 7056b891e087..d952ee48f4dc 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -1,7 +1,7 @@ /* * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions * - * Copyright (c) International Business Machines Corp., 2002,2009 + * Copyright (c) International Business Machines Corp., 2002,2013 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -22,7 +22,7 @@ /* IOCTL information */ /* * List of ioctl/fsctl function codes that are or could be useful in the - * future to remote clients like cifs or SMB2 client. There is probably + * future to remote clients like cifs or SMB2/SMB3 client. This is probably * a slightly larger set of fsctls that NTFS local filesystem could handle, * including the seven below that we do not have struct definitions for. * Even with protocol definitions for most of these now available, we still @@ -30,7 +30,13 @@ * remotely. Some of the following, such as the encryption/compression ones * could be invoked from tools via a specialized hook into the VFS rather * than via the standard vfs entry points + * + * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are + * below). Additional detail on less common ones can be found in MS-FSCC + * section 2.3. */ +#define FSCTL_DFS_GET_REFERRALS 0x00060194 +#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0 #define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 #define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 #define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 @@ -71,14 +77,31 @@ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ +#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ #define FSCTL_SIS_LINK_FILES 0x0009C104 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ #define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ /* strange that the number for this op is not sequential with previous op */ #define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ +/* Enumerate previous versions of a file */ +#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064 +/* Retrieve an opaque file reference for server-side data movement ie copy */ +#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078 +#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ +#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */ +/* Perform server-side data movement */ +#define FSCTL_SRV_COPYCHUNK 0x001440F2 +#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2 +#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */ +#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */ #define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 #define IO_REPARSE_TAG_HSM 0xC0000004 #define IO_REPARSE_TAG_SIS 0x80000007 + +/* fsctl flags */ +/* If Flags is set to this value, the request is an FSCTL not ioctl request */ +#define SMB2_0_IOCTL_IS_FSCTL 0x00000001 + diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1a528680ec5a..6fdcb1b4a106 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -49,7 +49,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) struct mid_q_entry *temp; if (server == NULL) { - cERROR(1, "Null TCP session in AllocMidQEntry"); + cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n"); return NULL; } @@ -61,7 +61,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp->mid = smb_buffer->Mid; /* always LE */ temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); - cFYI(1, "For smb_command %d", smb_buffer->Command); + cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; @@ -179,17 +179,11 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, */ rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], n_vec - first_vec, remaining); - if (rc == -ENOSPC || rc == -EAGAIN) { - /* - * Catch if a low level driver returns -ENOSPC. This - * WARN_ON will be removed by 3.10 if no one reports - * seeing this. - */ - WARN_ON_ONCE(rc == -ENOSPC); + if (rc == -EAGAIN) { i++; if (i >= 14 || (!server->noblocksnd && (i > 2))) { - cERROR(1, "sends on sock %p stuck for 15 " - "seconds", ssocket); + cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", + ssocket); rc = -EAGAIN; break; } @@ -209,14 +203,14 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, } if (rc > remaining) { - cERROR(1, "sent %d requested %d", rc, remaining); + cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining); break; } if (rc == 0) { /* should never happen, letting socket clear before retrying is our only obvious option here */ - cERROR(1, "tcp sent no data"); + cifs_dbg(VFS, "tcp sent no data\n"); msleep(500); continue; } @@ -291,7 +285,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) if (ssocket == NULL) return -ENOTSOCK; - cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); /* cork the socket */ @@ -324,8 +318,8 @@ uncork: (char *)&val, sizeof(val)); if ((total_len > 0) && (total_len != smb_buf_length + 4)) { - cFYI(1, "partial send (wanted=%u sent=%zu): terminating " - "session", smb_buf_length + 4, total_len); + cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", + smb_buf_length + 4, total_len); /* * If we have only sent part of an SMB then the next SMB could * be taken as the remainder of this one. We need to kill the @@ -335,7 +329,8 @@ uncork: } if (rc < 0 && rc != -EINTR) - cERROR(1, "Error %d sending data on socket to server", rc); + cifs_dbg(VFS, "Error %d sending data on socket to server\n", + rc); else rc = 0; @@ -427,7 +422,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, } if (ses->server->tcpStatus == CifsNeedReconnect) { - cFYI(1, "tcp session dead - return to caller to retry"); + cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } @@ -452,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_freezekillable(server->response_q, + error = wait_event_freezekillable_unsafe(server->response_q, midQ->mid_state != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; @@ -468,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct mid_q_entry *mid; /* enable signing if server requires it */ - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (server->sign) hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; mid = AllocMidQEntry(hdr, server); @@ -527,6 +522,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, rc = smb_send_rqst(server, rqst); cifs_in_send_dec(server); cifs_save_when_sent(mid); + + if (rc < 0) + server->sequence_number -= 2; mutex_unlock(&server->srv_mutex); if (rc == 0) @@ -559,7 +557,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, iov[0].iov_len = get_rfc1002_length(in_buf) + 4; flags |= CIFS_NO_RESP; rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); - cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); + cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc); return rc; } @@ -569,8 +567,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) { int rc = 0; - cFYI(1, "%s: cmd=%d mid=%llu state=%d", __func__, - le16_to_cpu(mid->command), mid->mid, mid->mid_state); + cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n", + __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state); spin_lock(&GlobalMid_Lock); switch (mid->mid_state) { @@ -588,8 +586,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) break; default: list_del_init(&mid->qhead); - cERROR(1, "%s: invalid mid state mid=%llu state=%d", __func__, - mid->mid, mid->mid_state); + cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", + __func__, mid->mid, mid->mid_state); rc = -EIO; } spin_unlock(&GlobalMid_Lock); @@ -614,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (server->sign) { struct kvec iov; int rc = 0; struct smb_rqst rqst = { .rq_iov = &iov, @@ -624,10 +622,10 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, iov.iov_len = len; /* FIXME: add code to kill session */ rc = cifs_verify_signature(&rqst, server, - mid->sequence_number + 1); + mid->sequence_number); if (rc) - cERROR(1, "SMB signature verification returned error = " - "%d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } /* BB special case reconnect tid and uid here? */ @@ -672,7 +670,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if ((ses == NULL) || (ses->server == NULL)) { cifs_small_buf_release(buf); - cERROR(1, "Null session"); + cifs_dbg(VFS, "Null session\n"); return -EIO; } @@ -716,6 +714,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); + if (rc < 0) + ses->server->sequence_number -= 2; mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { @@ -752,7 +752,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cFYI(1, "Bad MID state?"); + cifs_dbg(FYI, "Bad MID state?\n"); goto out; } @@ -788,11 +788,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, struct mid_q_entry *midQ; if (ses == NULL) { - cERROR(1, "Null smb session"); + cifs_dbg(VFS, "Null smb session\n"); return -EIO; } if (ses->server == NULL) { - cERROR(1, "Null tcp session"); + cifs_dbg(VFS, "Null tcp session\n"); return -EIO; } @@ -805,8 +805,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "Illegal length, greater than maximum frame, %d", - be32_to_cpu(in_buf->smb_buf_length)); + cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + be32_to_cpu(in_buf->smb_buf_length)); return -EIO; } @@ -840,6 +840,10 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + mutex_unlock(&ses->server->srv_mutex); if (rc < 0) @@ -871,7 +875,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (!midQ->resp_buf || !out_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cERROR(1, "Bad MID state?"); + cifs_dbg(VFS, "Bad MID state?\n"); goto out; } @@ -921,13 +925,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_ses *ses; if (tcon == NULL || tcon->ses == NULL) { - cERROR(1, "Null smb session"); + cifs_dbg(VFS, "Null smb session\n"); return -EIO; } ses = tcon->ses; if (ses->server == NULL) { - cERROR(1, "Null tcp session"); + cifs_dbg(VFS, "Null tcp session\n"); return -EIO; } @@ -940,8 +944,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "Illegal length, greater than maximum frame, %d", - be32_to_cpu(in_buf->smb_buf_length)); + cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + be32_to_cpu(in_buf->smb_buf_length)); return -EIO; } @@ -973,6 +977,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { @@ -1038,7 +1046,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* rcvd frame is ok */ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cERROR(1, "Bad MID state?"); + cifs_dbg(VFS, "Bad MID state?\n"); goto out; } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5142f2c60278..09afda4cc58e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -68,12 +68,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) goto remove_ea_exit; } if (ea_name == NULL) { - cFYI(1, "Null xattr names not supported"); + cifs_dbg(FYI, "Null xattr names not supported\n"); } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { - cFYI(1, - "illegal xattr request %s (only user namespace supported)", - ea_name); + cifs_dbg(FYI, + "illegal xattr request %s (only user namespace supported)\n", + ea_name); /* BB what if no namespace prefix? */ /* Should we just pass them to server, except for system and perhaps security prefixes? */ @@ -134,19 +134,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, search server for EAs or streams to returns as xattrs */ if (value_size > MAX_EA_VALUE_SIZE) { - cFYI(1, "size of EA value too large"); + cifs_dbg(FYI, "size of EA value too large\n"); rc = -EOPNOTSUPP; goto set_ea_exit; } if (ea_name == NULL) { - cFYI(1, "Null xattr names not supported"); + cifs_dbg(FYI, "Null xattr names not supported\n"); } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) - cFYI(1, "attempt to set cifs inode metadata"); + cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, @@ -167,8 +167,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, struct cifs_ntsd *pacl; pacl = kmalloc(value_size, GFP_KERNEL); if (!pacl) { - cFYI(1, "%s: Can't allocate memory for ACL", - __func__); rc = -ENOMEM; } else { memcpy(pacl, ea_value, value_size); @@ -179,7 +177,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, kfree(pacl); } #else - cFYI(1, "Set CIFS ACL not supported yet"); + cifs_dbg(FYI, "Set CIFS ACL not supported yet\n"); #endif /* CONFIG_CIFS_ACL */ } else { int temp; @@ -193,9 +191,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, ACL_TYPE_ACCESS, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "set POSIX ACL rc %d", rc); + cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc); #else - cFYI(1, "set POSIX ACL not supported"); + cifs_dbg(FYI, "set POSIX ACL not supported\n"); #endif } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -206,13 +204,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, ACL_TYPE_DEFAULT, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "set POSIX default ACL rc %d", rc); + cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc); #else - cFYI(1, "set default POSIX ACL not supported"); + cifs_dbg(FYI, "set default POSIX ACL not supported\n"); #endif } else { - cFYI(1, "illegal xattr request %s (only user namespace" - " supported)", ea_name); + cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n", + ea_name); /* BB what if no namespace prefix? */ /* Should we just pass them to server, except for system and perhaps security prefixes? */ @@ -263,14 +261,14 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ if (ea_name == NULL) { - cFYI(1, "Null xattr names not supported"); + cifs_dbg(FYI, "Null xattr names not supported\n"); } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { - cFYI(1, "attempt to query cifs inode metadata"); + cifs_dbg(FYI, "attempt to query cifs inode metadata\n"); /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ @@ -295,7 +293,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "Query POSIX ACL not supported yet"); + cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -307,7 +305,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "Query POSIX default ACL not supported yet"); + cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { @@ -319,8 +317,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); - cERROR(1, "%s: error %zd getting sec desc", - __func__, rc); + cifs_dbg(VFS, "%s: error %zd getting sec desc\n", + __func__, rc); } else { if (ea_value) { if (acllen > buf_size) @@ -332,18 +330,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, kfree(pacl); } #else - cFYI(1, "Query CIFS ACL not supported yet"); + cifs_dbg(FYI, "Query CIFS ACL not supported yet\n"); #endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { - cFYI(1, "Trusted xattr namespace not supported yet"); + cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n"); } else if (strncmp(ea_name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { - cFYI(1, "Security xattr namespace not supported yet"); + cifs_dbg(FYI, "Security xattr namespace not supported yet\n"); } else - cFYI(1, - "illegal xattr request %s (only user namespace supported)", - ea_name); + cifs_dbg(FYI, + "illegal xattr request %s (only user namespace supported)\n", + ea_name); /* We could add an additional check for streams ie if proc/fs/cifs/streamstoxattr is set then diff --git a/fs/coda/dir.c b/fs/coda/dir.c index b7d3a05c062c..190effc6a6fa 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry); /* dir file-ops */ -static int coda_readdir(struct file *file, void *buf, filldir_t filldir); +static int coda_readdir(struct file *file, struct dir_context *ctx); /* dentry ops */ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); static int coda_dentry_delete(const struct dentry *); /* support routines */ -static int coda_venus_readdir(struct file *coda_file, void *buf, - filldir_t filldir); +static int coda_venus_readdir(struct file *, struct dir_context *); /* same as fs/bad_inode.c */ static int coda_return_EIO(void) @@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations = const struct file_operations coda_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = coda_readdir, + .iterate = coda_readdir, .open = coda_open, .release = coda_release, .fsync = coda_fsync, @@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, /* file operations for directories */ -static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) +static int coda_readdir(struct file *coda_file, struct dir_context *ctx) { struct coda_file_info *cfi; struct file *host_file; @@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) if (!host_file->f_op) return -ENOTDIR; - if (host_file->f_op->readdir) - { - /* potemkin case: we were handed a directory inode. - * We can't use vfs_readdir because we have to keep the file - * position in sync between the coda_file and the host_file. - * and as such we need grab the inode mutex. */ + if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); - host_file->f_pos = coda_file->f_pos; - ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->readdir(host_file, buf, filldir); + ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - - coda_file->f_pos = host_file->f_pos; mutex_unlock(&host_inode->i_mutex); + return ret; } - else /* Venus: we must read Venus dirents from a file */ - ret = coda_venus_readdir(coda_file, buf, filldir); - - return ret; + /* Venus: we must read Venus dirents from a file */ + return coda_venus_readdir(coda_file, ctx); } static inline unsigned int CDT2DT(unsigned char cdt) @@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt) } /* support routines */ -static int coda_venus_readdir(struct file *coda_file, void *buf, - filldir_t filldir) +static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) { - int result = 0; /* # of entries returned */ struct coda_file_info *cfi; struct coda_inode_info *cii; struct file *host_file; @@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); if (!vdir) return -ENOMEM; - if (coda_file->f_pos == 0) { - ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); - if (ret < 0) - goto out; - result++; - coda_file->f_pos++; - } - if (coda_file->f_pos == 1) { - ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR); - if (ret < 0) - goto out; - result++; - coda_file->f_pos++; - } + if (!dir_emit_dots(coda_file, ctx)) + goto out; + while (1) { /* read entries from the directory file */ - ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, + ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, sizeof(*vdir)); if (ret < 0) { printk(KERN_ERR "coda readdir: read dir %s failed %d\n", @@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, /* Make sure we skip '.' and '..', we already got those */ if (name.name[0] == '.' && (name.len == 1 || - (vdir->d_name[1] == '.' && name.len == 2))) + (name.name[1] == '.' && name.len == 2))) vdir->d_fileno = name.len = 0; /* skip null entries */ if (vdir->d_fileno && name.len) { - /* try to look up this entry in the dcache, that way - * userspace doesn't have to worry about breaking - * getcwd by having mismatched inode numbers for - * internal volume mountpoints. */ - ino = find_inode_number(de, &name); - if (!ino) ino = vdir->d_fileno; - + ino = vdir->d_fileno; type = CDT2DT(vdir->d_type); - ret = filldir(buf, name.name, name.len, - coda_file->f_pos, ino, type); - /* failure means no space for filling in this round */ - if (ret < 0) break; - result++; + if (!dir_emit(ctx, name.name, name.len, ino, type)) + break; } /* we'll always have progress because d_reclen is unsigned and * we've already established it is non-zero. */ - coda_file->f_pos += vdir->d_reclen; + ctx->pos += vdir->d_reclen; } out: kfree(vdir); - return result ? result : ret; + return 0; } /* called when a cache lookup succeeds */ @@ -560,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) if (cii->c_flags & C_FLUSH) coda_flag_inode_children(inode, C_FLUSH); - if (de->d_count > 1) + if (d_count(de) > 1) /* pretend it's valid, but don't change the flags */ goto out; diff --git a/fs/coda/file.c b/fs/coda/file.c index fa4c100bdc7d..380b798f8443 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,6 +79,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo return -EINVAL; host_inode = file_inode(host_file); + file_start_write(host_file); mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); @@ -87,6 +88,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; mutex_unlock(&coda_inode->i_mutex); + file_end_write(host_file); return ret; } diff --git a/fs/compat.c b/fs/compat.c index d487985dd0ea..6af20de2c1a3 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -44,10 +44,10 @@ #include <linux/signal.h> #include <linux/poll.h> #include <linux/mm.h> -#include <linux/eventpoll.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -68,8 +68,6 @@ int compat_printk(const char *fmt, ...) return ret; } -#include "read_write.h" - /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -834,6 +832,7 @@ struct compat_old_linux_dirent { }; struct compat_readdir_callback { + struct dir_context ctx; struct compat_old_linux_dirent __user *dirent; int result; }; @@ -875,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, { int error; struct fd f = fdget(fd); - struct compat_readdir_callback buf; + struct compat_readdir_callback buf = { + .ctx.actor = compat_fillonedir, + .dirent = dirent + }; if (!f.file) return -EBADF; - buf.result = 0; - buf.dirent = dirent; - - error = vfs_readdir(f.file, compat_fillonedir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (buf.result) error = buf.result; @@ -899,6 +898,7 @@ struct compat_linux_dirent { }; struct compat_getdents_callback { + struct dir_context ctx; struct compat_linux_dirent __user *current_dir; struct compat_linux_dirent __user *previous; int count; @@ -953,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd, { struct fd f; struct compat_linux_dirent __user * lastdirent; - struct compat_getdents_callback buf; + struct compat_getdents_callback buf = { + .ctx.actor = compat_filldir, + .current_dir = dirent, + .count = count + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -963,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, compat_filldir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - if (put_user(f.file->f_pos, &lastdirent->d_off)) + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -985,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 struct compat_getdents_callback64 { + struct dir_context ctx; struct linux_dirent64 __user *current_dir; struct linux_dirent64 __user *previous; int count; @@ -1038,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, { struct fd f; struct linux_dirent64 __user * lastdirent; - struct compat_getdents_callback64 buf; + struct compat_getdents_callback64 buf = { + .ctx.actor = compat_filldir64, + .current_dir = dirent, + .count = count + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -1048,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, compat_filldir64, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - typeof(lastdirent->d_off) d_off = f.file->f_pos; + typeof(lastdirent->d_off) d_off = buf.ctx.pos; if (__put_user_unaligned(d_off, &lastdirent->d_off)) error = -EFAULT; else @@ -1069,210 +1068,6 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, } #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ -static ssize_t compat_do_readv_writev(int type, struct file *file, - const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos) -{ - compat_ssize_t tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - ssize_t ret; - io_fn_t fn; - iov_fn_t fnv; - - ret = -EINVAL; - if (!file->f_op) - goto out; - - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); - if (ret <= 0) - goto out; - - tot_len = ret; - ret = rw_verify_area(type, file, pos, tot_len); - if (ret < 0) - goto out; - - fnv = NULL; - if (type == READ) { - fn = file->f_op->read; - fnv = file->f_op->aio_read; - } else { - fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->aio_write; - } - - if (fnv) - ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, - pos, fnv); - else - ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); - -out: - if (iov != iovstack) - kfree(iov); - if ((ret + (type == READ)) > 0) { - if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } - return ret; -} - -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) - goto out; - - ret = compat_do_readv_writev(READ, file, vec, vlen, pos); - -out: - if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); - return ret; -} - -asmlinkage ssize_t -compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen) -{ - struct fd f = fdget(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos); - f.file->f_pos = pos; - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos); - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) -{ - loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_preadv64(fd, vec, vlen, pos); -} - -static size_t compat_writev(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) - goto out; - - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); - -out: - if (ret > 0) - add_wchar(current, ret); - inc_syscw(current); - return ret; -} - -asmlinkage ssize_t -compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen) -{ - struct fd f = fdget(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos); - f.file->f_pos = pos; - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos); - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) -{ - loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_pwritev64(fd, vec, vlen, pos); -} - -asmlinkage long -compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, - unsigned int nr_segs, unsigned int flags) -{ - unsigned i; - struct iovec __user *iov; - if (nr_segs > UIO_MAXIOV) - return -EINVAL; - iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); - for (i = 0; i < nr_segs; i++) { - struct compat_iovec v; - if (get_user(v.iov_base, &iov32[i].iov_base) || - get_user(v.iov_len, &iov32[i].iov_len) || - put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || - put_user(v.iov_len, &iov[i].iov_len)) - return -EFAULT; - } - return sys_vmsplice(fd, iov, nr_segs, flags); -} - /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. @@ -1658,84 +1453,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, return ret; } -#ifdef CONFIG_EPOLL - -asmlinkage long compat_sys_epoll_pwait(int epfd, - struct compat_epoll_event __user *events, - int maxevents, int timeout, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - long err; - compat_sigset_t csigmask; - sigset_t ksigmask, sigsaved; - - /* - * If the caller wants a certain signal mask to be set during the wait, - * we apply it here. - */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) - return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - err = sys_epoll_wait(epfd, events, maxevents, timeout); - - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (err == -EINTR) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } - - return err; -} - -#endif /* CONFIG_EPOLL */ - -#ifdef CONFIG_SIGNALFD - -asmlinkage long compat_sys_signalfd4(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize, int flags) -{ - compat_sigset_t ss32; - sigset_t tmp; - sigset_t __user *ksigmask; - - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&tmp, &ss32); - ksigmask = compat_alloc_user_space(sizeof(sigset_t)); - if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) - return -EFAULT; - - return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); -} - -asmlinkage long compat_sys_signalfd(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); -} -#endif /* CONFIG_SIGNALFD */ - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it @@ -1747,25 +1464,3 @@ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, return do_handle_open(mountdirfd, handle, flags); } #endif - -#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE -asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, - compat_off_t __user *offset, compat_size_t count) -{ - loff_t pos; - off_t off; - ssize_t ret; - - if (offset) { - if (unlikely(get_user(off, offset))) - return -EFAULT; - pos = off; - ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); - if (unlikely(put_user(pos, offset))) - return -EFAULT; - return ret; - } - - return do_sendfile(out_fd, in_fd, NULL, count, 0); -} -#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3ced75f765ca..5d19acfa7c6c 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -66,7 +66,6 @@ #include <linux/gigaset_dev.h> #ifdef CONFIG_BLOCK -#include <linux/loop.h> #include <linux/cdrom.h> #include <linux/fd.h> #include <scsi/scsi.h> @@ -608,7 +607,6 @@ struct serial_struct32 { static int serial_struct_ioctl(unsigned fd, unsigned cmd, struct serial_struct32 __user *ss32) { - typedef struct serial_struct SS; typedef struct serial_struct32 SS32; int err; struct serial_struct ss; @@ -955,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP) /* Socket level stuff */ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) /* md calls this on random blockdevs */ IGNORE_IOCTL(RAID_VERSION) /* qemu/qemu-img might call these two on plain files for probing */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7aabc6ad4e9b..5e7c60c1cb63 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d) if (d->d_inode) simple_rmdir(parent->d_inode,d); - pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); + pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d)); dput(parent); } @@ -1532,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd) return (sd->s_mode >> 12) & 15; } -static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int configfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct super_block *sb = dentry->d_sb; struct configfs_dirent * parent_sd = dentry->d_fsdata; - struct configfs_dirent *cursor = filp->private_data; + struct configfs_dirent *cursor = file->private_data; struct list_head *p, *q = &cursor->s_sibling; ino_t ino = 0; - int i = filp->f_pos; - switch (i) { - case 0: - ino = dentry->d_inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - default: - if (filp->f_pos == 2) { - spin_lock(&configfs_dirent_lock); - list_move(q, &parent_sd->s_children); - spin_unlock(&configfs_dirent_lock); - } - for (p=q->next; p!= &parent_sd->s_children; p=p->next) { - struct configfs_dirent *next; - const char * name; - int len; - struct inode *inode = NULL; + if (!dir_emit_dots(file, ctx)) + return 0; + if (ctx->pos == 2) { + spin_lock(&configfs_dirent_lock); + list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + } + for (p = q->next; p != &parent_sd->s_children; p = p->next) { + struct configfs_dirent *next; + const char *name; + int len; + struct inode *inode = NULL; + + next = list_entry(p, struct configfs_dirent, s_sibling); + if (!next->s_element) + continue; - next = list_entry(p, struct configfs_dirent, - s_sibling); - if (!next->s_element) - continue; - - name = configfs_get_name(next); - len = strlen(name); - - /* - * We'll have a dentry and an inode for - * PINNED items and for open attribute - * files. We lock here to prevent a race - * with configfs_d_iput() clearing - * s_dentry before calling iput(). - * - * Why do we go to the trouble? If - * someone has an attribute file open, - * the inode number should match until - * they close it. Beyond that, we don't - * care. - */ - spin_lock(&configfs_dirent_lock); - dentry = next->s_dentry; - if (dentry) - inode = dentry->d_inode; - if (inode) - ino = inode->i_ino; - spin_unlock(&configfs_dirent_lock); - if (!inode) - ino = iunique(sb, 2); + name = configfs_get_name(next); + len = strlen(name); + + /* + * We'll have a dentry and an inode for + * PINNED items and for open attribute + * files. We lock here to prevent a race + * with configfs_d_iput() clearing + * s_dentry before calling iput(). + * + * Why do we go to the trouble? If + * someone has an attribute file open, + * the inode number should match until + * they close it. Beyond that, we don't + * care. + */ + spin_lock(&configfs_dirent_lock); + dentry = next->s_dentry; + if (dentry) + inode = dentry->d_inode; + if (inode) + ino = inode->i_ino; + spin_unlock(&configfs_dirent_lock); + if (!inode) + ino = iunique(sb, 2); - if (filldir(dirent, name, len, filp->f_pos, ino, - dt_type(next)) < 0) - return 0; + if (!dir_emit(ctx, name, len, ino, dt_type(next))) + return 0; - spin_lock(&configfs_dirent_lock); - list_move(q, p); - spin_unlock(&configfs_dirent_lock); - p = q; - filp->f_pos++; - } + spin_lock(&configfs_dirent_lock); + list_move(q, p); + spin_unlock(&configfs_dirent_lock); + p = q; + ctx->pos++; } return 0; } @@ -1661,7 +1643,7 @@ const struct file_operations configfs_dir_operations = { .release = configfs_dir_close, .llseek = configfs_dir_lseek, .read = generic_read_dir, - .readdir = configfs_readdir, + .iterate = configfs_readdir, }; int configfs_register_subsystem(struct configfs_subsystem *subsys) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2b6cb23dd14e..1d1c41f1014d 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, count); + len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); diff --git a/fs/coredump.c b/fs/coredump.c index c6479658d487..72f816d6cad9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -45,69 +45,79 @@ #include <trace/events/sched.h> int core_uses_pid; -char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_name_size = CORENAME_MAX_SIZE; struct core_name { char *corename; int used, size; }; -static atomic_t call_count = ATOMIC_INIT(1); /* The maximal length of core_pattern is also specified in sysctl.c */ -static int expand_corename(struct core_name *cn) +static int expand_corename(struct core_name *cn, int size) { - char *old_corename = cn->corename; + char *corename = krealloc(cn->corename, size, GFP_KERNEL); - cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); - cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); - - if (!cn->corename) { - kfree(old_corename); + if (!corename) return -ENOMEM; - } + if (size > core_name_size) /* racy but harmless */ + core_name_size = size; + + cn->size = ksize(corename); + cn->corename = corename; return 0; } +static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) +{ + int free, need; + +again: + free = cn->size - cn->used; + need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + if (need < free) { + cn->used += need; + return 0; + } + + if (!expand_corename(cn, cn->size + need - free + 1)) + goto again; + + return -ENOMEM; +} + static int cn_printf(struct core_name *cn, const char *fmt, ...) { - char *cur; - int need; - int ret; va_list arg; + int ret; va_start(arg, fmt); - need = vsnprintf(NULL, 0, fmt, arg); + ret = cn_vprintf(cn, fmt, arg); va_end(arg); - if (likely(need < cn->size - cn->used - 1)) - goto out_printf; + return ret; +} - ret = expand_corename(cn); - if (ret) - goto expand_fail; +static int cn_esc_printf(struct core_name *cn, const char *fmt, ...) +{ + int cur = cn->used; + va_list arg; + int ret; -out_printf: - cur = cn->corename + cn->used; va_start(arg, fmt); - vsnprintf(cur, need + 1, fmt, arg); + ret = cn_vprintf(cn, fmt, arg); va_end(arg); - cn->used += need; - return 0; -expand_fail: + for (; cur < cn->used; ++cur) { + if (cn->corename[cur] == '/') + cn->corename[cur] = '!'; + } return ret; } -static void cn_escape(char *str) -{ - for (; *str; str++) - if (*str == '/') - *str = '!'; -} - static int cn_print_exe_file(struct core_name *cn) { struct file *exe_file; @@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn) int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) { - char *commstart = cn->corename + cn->used; - ret = cn_printf(cn, "%s (path unknown)", current->comm); - cn_escape(commstart); - return ret; - } + if (!exe_file) + return cn_esc_printf(cn, "%s (path unknown)", current->comm); pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - cn_escape(path); - - ret = cn_printf(cn, "%s", path); + ret = cn_esc_printf(cn, "%s", path); free_buf: kfree(pathbuf); @@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) int pid_in_pattern = 0; int err = 0; - cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); - cn->corename = kmalloc(cn->size, GFP_KERNEL); cn->used = 0; - - if (!cn->corename) + cn->corename = NULL; + if (expand_corename(cn, core_name_size)) return -ENOMEM; + cn->corename[0] = '\0'; + + if (ispipe) + ++pat_ptr; /* Repeat as long as we have more pattern to process and more output space */ while (*pat_ptr) { if (*pat_ptr != '%') { - if (*pat_ptr == 0) - goto out; err = cn_printf(cn, "%c", *pat_ptr++); } else { switch (*++pat_ptr) { @@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; } /* hostname */ - case 'h': { - char *namestart = cn->corename + cn->used; + case 'h': down_read(&uts_sem); - err = cn_printf(cn, "%s", + err = cn_esc_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); - cn_escape(namestart); break; - } /* executable */ - case 'e': { - char *commstart = cn->corename + cn->used; - err = cn_printf(cn, "%s", current->comm); - cn_escape(commstart); + case 'e': + err = cn_esc_printf(cn, "%s", current->comm); break; - } case 'E': err = cn_print_exe_file(cn); break; @@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) return err; } +out: /* Backward compatibility with core_uses_pid: * * If core_pattern does not include a %p (as is the default) @@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) if (err) return err; } -out: return ispipe; } @@ -263,7 +261,6 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -280,8 +277,8 @@ static int zap_process(struct task_struct *start, int exit_code) return nr; } -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - struct core_state *core_state, int exit_code) +static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; @@ -291,11 +288,16 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; nr = zap_process(tsk, exit_code); + tsk->signal->group_exit_task = tsk; + /* ignore all signals except SIGKILL, see prepare_signal() */ + tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) return nr; + tsk->flags = PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* @@ -340,6 +342,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); nr += zap_process(p, exit_code); + p->signal->flags = SIGNAL_GROUP_EXIT; unlock_task_sighand(p, &flags); } break; @@ -386,11 +389,18 @@ static int coredump_wait(int exit_code, struct core_state *core_state) return core_waiters; } -static void coredump_finish(struct mm_struct *mm) +static void coredump_finish(struct mm_struct *mm, bool core_dumped) { struct core_thread *curr, *next; struct task_struct *task; + spin_lock_irq(¤t->sighand->siglock); + if (core_dumped && !__fatal_signal_pending(current)) + current->signal->group_exit_code |= 0x80; + current->signal->group_exit_task = NULL; + current->signal->flags = SIGNAL_GROUP_EXIT; + spin_unlock_irq(¤t->sighand->siglock); + next = mm->core_state->dumper.next; while ((curr = next) != NULL) { next = curr->next; @@ -407,26 +417,38 @@ static void coredump_finish(struct mm_struct *mm) mm->core_state = NULL; } -static void wait_for_dump_helpers(struct file *file) +static bool dump_interrupted(void) { - struct pipe_inode_info *pipe; + /* + * SIGKILL or freezing() interrupt the coredumping. Perhaps we + * can do try_to_freeze() and check __fatal_signal_pending(), + * but then we need to teach dump_write() to restart and clear + * TIF_SIGPENDING. + */ + return signal_pending(current); +} - pipe = file_inode(file)->i_pipe; +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe = file->private_data; pipe_lock(pipe); pipe->readers++; pipe->writers--; + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_unlock(pipe); - while ((pipe->readers > 1) && (!signal_pending(current))) { - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - pipe_wait(pipe); - } + /* + * We actually want wait_event_freezable() but then we need + * to clear TIF_SIGPENDING and improve dump_interrupted(). + */ + wait_event_interruptible(pipe->wait, pipe->readers == 1); + pipe_lock(pipe); pipe->readers--; pipe->writers++; pipe_unlock(pipe); - } /* @@ -471,6 +493,7 @@ void do_coredump(siginfo_t *siginfo) int ispipe; struct files_struct *displaced; bool need_nonrelative = false; + bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, @@ -514,22 +537,17 @@ void do_coredump(siginfo_t *siginfo) old_cred = override_creds(cred); - /* - * Clear any false indication of pending signals that might - * be seen by the filesystem code called to write the core file. - */ - clear_thread_flag(TIF_SIGPENDING); - ispipe = format_corename(&cn, &cprm); - if (ispipe) { + if (ispipe) { int dump_count; char **helper_argv; + struct subprocess_info *sub_info; if (ispipe < 0) { printk(KERN_WARNING "format_corename failed\n"); printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; + goto fail_unlock; } if (cprm.limit == 1) { @@ -564,22 +582,27 @@ void do_coredump(siginfo_t *siginfo) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); + helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); goto fail_dropcount; } - retval = call_usermodehelper_fns(helper_argv[0], helper_argv, - NULL, UMH_WAIT_EXEC, umh_pipe_setup, - NULL, &cprm); + retval = -ENOMEM; + sub_info = call_usermodehelper_setup(helper_argv[0], + helper_argv, NULL, GFP_KERNEL, + umh_pipe_setup, NULL, &cprm); + if (sub_info) + retval = call_usermodehelper_exec(sub_info, + UMH_WAIT_EXEC); + argv_free(helper_argv); if (retval) { - printk(KERN_INFO "Core dump to %s pipe failed\n", + printk(KERN_INFO "Core dump to |%s pipe failed\n", cn.corename); goto close_fail; - } + } } else { struct inode *inode; @@ -629,10 +652,11 @@ void do_coredump(siginfo_t *siginfo) goto close_fail; if (displaced) put_files_struct(displaced); - retval = binfmt->core_dump(&cprm); - if (retval) - current->signal->group_exit_code |= 0x80; - + if (!dump_interrupted()) { + file_start_write(cprm.file); + core_dumped = binfmt->core_dump(&cprm); + file_end_write(cprm.file); + } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); close_fail: @@ -643,8 +667,7 @@ fail_dropcount: atomic_dec(&core_dump_count); fail_unlock: kfree(cn.corename); -fail_corename: - coredump_finish(mm); + coredump_finish(mm, core_dumped); revert_creds(old_cred); fail_creds: put_cred(cred); @@ -659,7 +682,9 @@ fail: */ int dump_write(struct file *file, const void *addr, int nr) { - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; + return !dump_interrupted() && + access_ok(VERIFY_READ, addr, nr) && + file->f_op->write(file, addr, nr, &file->f_pos) == nr; } EXPORT_SYMBOL(dump_write); @@ -668,7 +693,8 @@ int dump_seek(struct file *file, loff_t off) int ret = 1; if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + if (dump_interrupted() || + file->f_op->llseek(file, off, SEEK_CUR) < 0) return 0; } else { char *buf = (char *)get_zeroed_page(GFP_KERNEL); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 35b1c7bd18b7..e501ac3a49ff 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* * Read a cramfs directory entry. */ -static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int cramfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; - int copied; /* Offset within the thing. */ - offset = filp->f_pos; - if (offset >= inode->i_size) + if (ctx->pos >= inode->i_size) return 0; + offset = ctx->pos; /* Directory entries are always 4-byte aligned */ if (offset & 3) return -EINVAL; @@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (!buf) return -ENOMEM; - copied = 0; while (offset < inode->i_size) { struct cramfs_inode *de; unsigned long nextoffset; char *name; ino_t ino; umode_t mode; - int namelen, error; + int namelen; mutex_lock(&read_mutex); de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); @@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; namelen--; } - error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); - if (error) + if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) break; - offset = nextoffset; - filp->f_pos = offset; - copied++; + ctx->pos = offset = nextoffset; } kfree(buf); return 0; @@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = { static const struct file_operations cramfs_directory_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = cramfs_readdir, + .iterate = cramfs_readdir, }; static const struct inode_operations cramfs_dir_inode_operations = { diff --git a/fs/dcache.c b/fs/dcache.c index e8bc3420d63e..87bdb5329c3c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -337,23 +337,6 @@ static void dentry_lru_del(struct dentry *dentry) } } -/* - * Remove a dentry that is unreferenced and about to be pruned - * (unhashed and destroyed) from the LRU, and inform the file system. - * This wrapper should be called _prior_ to unhashing a victim dentry. - */ -static void dentry_lru_prune(struct dentry *dentry) -{ - if (!list_empty(&dentry->d_lru)) { - if (dentry->d_flags & DCACHE_OP_PRUNE) - dentry->d_op->d_prune(dentry); - - spin_lock(&dcache_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dcache_lru_lock); - } -} - static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); @@ -486,11 +469,13 @@ relock: if (ref) dentry->d_count--; /* - * if dentry was on the d_lru list delete it from there. * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - dentry_lru_prune(dentry); + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + dentry_lru_del(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -919,11 +904,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) struct inode *inode; /* - * remove the dentry from the lru, and inform - * the fs that this dentry is about to be + * inform the fs that this dentry is about to be * unhashed and destroyed. */ - dentry_lru_prune(dentry); + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + dentry_lru_del(dentry); __d_shrink(dentry); if (dentry->d_count != 0) { @@ -1230,8 +1217,10 @@ void shrink_dcache_parent(struct dentry * parent) LIST_HEAD(dispose); int found; - while ((found = select_parent(parent, &dispose)) != 0) + while ((found = select_parent(parent, &dispose)) != 0) { shrink_dentry_list(&dispose); + cond_resched(); + } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1623,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias); * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { @@ -1647,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) security_d_instantiate(dentry, inode); d_rehash(dentry); } - } else - d_add(dentry, inode); + } else { + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); + } return new; } EXPORT_SYMBOL(d_splice_alias); @@ -1734,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci); * Do the slow-case of the dentry name compare. * * Unlike the dentry_cmp() function, we need to atomically - * load the name, length and inode information, so that the + * load the name and length information, so that the * filesystem can rely on them, and can use the 'name' and * 'len' information without worrying about walking off the * end of memory etc. @@ -1752,22 +1748,18 @@ enum slow_d_compare { static noinline enum slow_d_compare slow_dentry_cmp( const struct dentry *parent, - struct inode *inode, struct dentry *dentry, unsigned int seq, const struct qstr *name) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; - struct inode *i = dentry->d_inode; if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); return D_COMP_SEQRETRY; } - if (parent->d_op->d_compare(parent, inode, - dentry, i, - tlen, tname, name)) + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) return D_COMP_NOMATCH; return D_COMP_OK; } @@ -1777,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp( * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found - * @inode: returns dentry->d_inode when the inode was found valid. * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name @@ -1804,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp( */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, - unsigned *seqp, struct inode *inode) + unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; @@ -1838,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, seqretry: /* * The dentry sequence count protects us from concurrent - * renames, and thus protects inode, parent and name fields. + * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order - * to do anything useful with the returned dentry, - * including using the 'd_inode' pointer. + * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it @@ -1856,12 +1846,12 @@ seqretry: continue; if (d_unhashed(dentry)) continue; - *seqp = seq; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; - switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { + *seqp = seq; + switch (slow_dentry_cmp(parent, dentry, seq, name)) { case D_COMP_OK: return dentry; case D_COMP_NOMATCH: @@ -1873,6 +1863,7 @@ seqretry: if (dentry->d_name.hash_len != hashlen) continue; + *seqp = seq; if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) return dentry; } @@ -1970,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) if (parent->d_flags & DCACHE_OP_COMPARE) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, parent->d_inode, - dentry, dentry->d_inode, - tlen, tname, name)) + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) goto next; } else { if (dentry->d_name.len != len) @@ -2009,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) */ name->hash = full_name_hash(name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { - int err = dir->d_op->d_hash(dir, dir->d_inode, name); + int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } @@ -2408,8 +2397,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry->d_parent = dentry; list_del_init(&dentry->d_u.d_child); anon->d_parent = dparent; - list_del(&anon->d_u.d_child); - list_add(&anon->d_u.d_child, &dparent->d_subdirs); + list_move(&anon->d_u.d_child, &dparent->d_subdirs); write_seqcount_end(&dentry->d_seq); write_seqcount_end(&anon->d_seq); @@ -2980,34 +2968,21 @@ rename_retry: goto again; } -/** - * find_inode_number - check for dentry with name - * @dir: directory to check - * @name: Name to find. - * - * Check whether a dentry already exists for the given name, - * and return the inode number if it has an inode. Otherwise - * 0 is returned. - * - * This routine is used to post-process directory listings for - * filesystems using synthetic inode numbers, and is necessary - * to keep getcwd() working. - */ - -ino_t find_inode_number(struct dentry *dir, struct qstr *name) +void d_tmpfile(struct dentry *dentry, struct inode *inode) { - struct dentry * dentry; - ino_t ino = 0; - - dentry = d_hash_and_lookup(dir, name); - if (!IS_ERR_OR_NULL(dentry)) { - if (dentry->d_inode) - ino = dentry->d_inode->i_ino; - dput(dentry); - } - return ino; + inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_alias) || + !d_unlinked(dentry)); + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); + d_instantiate(dentry, inode); } -EXPORT_SYMBOL(find_inode_number); +EXPORT_SYMBOL(d_tmpfile); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) diff --git a/fs/dcookies.c b/fs/dcookies.c index 17c779967828..ab5954b50267 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -25,6 +25,7 @@ #include <linux/dcookies.h> #include <linux/mutex.h> #include <linux/path.h> +#include <linux/compat.h> #include <asm/uaccess.h> /* The dcookies are allocated from a kmem_cache and @@ -145,7 +146,7 @@ out: /* And here is where the userspace process can look up the cookie value * to retrieve the path. */ -SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) { unsigned long cookie = (unsigned long)cookie64; int err = -EINVAL; @@ -201,12 +202,16 @@ out: mutex_unlock(&dcookie_mutex); return err; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len) + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) { - return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len); +#ifdef __BIG_ENDIAN + return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); +#else + return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); +#endif } -SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie); #endif static int dcookie_init(void) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index c5ca6ae5a30c..63146295153b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -21,6 +21,7 @@ #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> +#include <linux/atomic.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_size_t); +static int debugfs_atomic_t_set(void *data, u64 val) +{ + atomic_set((atomic_t *)data, val); + return 0; +} +static int debugfs_atomic_t_get(void *data, u64 *val) +{ + *val = atomic_read((atomic_t *)data); + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, + debugfs_atomic_t_set, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); + +/** + * debugfs_create_atomic_t - create a debugfs file that is used to read and + * write an atomic_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); static ssize_t read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; + buf[buf_size] = '\0'; if (strtobool(buf, &bv) == 0) *val = bv; diff --git a/fs/direct-io.c b/fs/direct-io.c index f853263cf74f..7ab90f5081ee 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> +#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -441,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio) static int dio_bio_complete(struct dio *dio, struct bio *bio) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec; - int page_no; + struct bio_vec *bvec; + unsigned i; if (!uptodate) dio->io_error = -EIO; @@ -450,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { - struct page *page = bvec[page_no].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (dio->rw == READ && !PageCompound(page)) set_page_dirty_lock(page); @@ -672,12 +673,6 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) dio_bio_submit(dio, sdio); - /* - * Submit now if the underlying fs is about to perform a - * metadata read - */ - else if (sdio->boundary) - dio_bio_submit(dio, sdio); } if (sdio->bio == NULL) { @@ -737,16 +732,6 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { sdio->cur_page_len += len; - - /* - * If sdio->boundary then we want to schedule the IO now to - * avoid metadata seeks. - */ - if (sdio->boundary) { - ret = dio_send_cur_page(dio, sdio, map_bh); - page_cache_release(sdio->cur_page); - sdio->cur_page = NULL; - } goto out; } @@ -758,7 +743,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, page_cache_release(sdio->cur_page); sdio->cur_page = NULL; if (ret) - goto out; + return ret; } page_cache_get(page); /* It is in dio */ @@ -768,6 +753,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_block = blocknr; sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: + /* + * If sdio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + dio_bio_submit(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; + } return ret; } @@ -969,7 +964,8 @@ do_holes: this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - sdio->boundary = buffer_boundary(map_bh); + if (this_chunk_blocks == sdio->blocks_available) + sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, offset_in_page, this_chunk_bytes, diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 7d58d5b112b5..76feb4b60fa6 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, const char *buf, size_t len) { - strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); - strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); + strlcpy(dlm_config.ci_cluster_name, buf, + sizeof(dlm_config.ci_cluster_name)); + strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); return len; } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 1b1146670c4b..e223a911a834 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; if (b == 1) { int len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > r->res_ls->ls_lvblen) + len = r->res_ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); lkb->lkb_lvbseq = ms->m_lvbseq; } @@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > ls->ls_lvblen) + len = ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); } return 0; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3ca79d3253b9..88556dc0458e 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force) void dlm_stop_lockspaces(void) { struct dlm_ls *ls; + int count; restart: + count = 0; spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { - if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) { + count++; continue; + } spin_unlock(&lslist_lock); log_error(ls, "no userland control daemon, stopping lockspace"); dlm_ls_stop(ls); goto restart; } spin_unlock(&lslist_lock); + + if (count) + log_print("dlm user daemon left %d lockspaces", count); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4f5ad246582f..d90909ec6aa6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -53,7 +53,6 @@ #include <linux/sctp.h> #include <linux/slab.h> #include <net/sctp/sctp.h> -#include <net/sctp/user.h> #include <net/ipv6.h> #include "dlm_internal.h" @@ -126,6 +125,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ + bool try_new_addr; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -144,6 +144,7 @@ struct dlm_node_addr { struct list_head list; int nodeid; int addr_count; + int curr_addr_index; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; @@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, - struct sockaddr *sa_out) + struct sockaddr *sa_out, bool try_new_addr) { struct sockaddr_storage sas; struct dlm_node_addr *na; @@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); - if (na && na->addr_count) - memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); + if (na && na->addr_count) { + if (try_new_addr) { + na->curr_addr_index++; + if (na->curr_addr_index == na->addr_count) + na->curr_addr_index = 0; + } + + memcpy(&sas, na->addr[na->curr_addr_index ], + sizeof(struct sockaddr_storage)); + } spin_unlock(&dlm_node_addrs_spin); if (!na) @@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) { struct dlm_node_addr *na; int rv = -EEXIST; + int addr_i; spin_lock(&dlm_node_addrs_spin); list_for_each_entry(na, &dlm_node_addrs, list) { if (!na->addr_count) continue; - if (!addr_compare(na->addr[0], addr)) - continue; - - *nodeid = na->nodeid; - rv = 0; - break; + for (addr_i = 0; addr_i < na->addr_count; addr_i++) { + if (addr_compare(na->addr[addr_i], addr)) { + *nodeid = na->nodeid; + rv = 0; + goto unlock; + } + } } +unlock: spin_unlock(&dlm_node_addrs_spin); return rv; } @@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd) static void sctp_init_failed_foreach(struct connection *con) { + + /* + * Don't try to recover base con and handle race where the + * other node's assoc init creates a assoc and we get that + * notification, then we get a notification that our attempt + * failed due. This happens when we are still trying the primary + * address, but the other node has already tried secondary addrs + * and found one that worked. + */ + if (!con->nodeid || con->sctp_assoc) + return; + + log_print("Retrying SCTP association init for node %d\n", con->nodeid); + + con->try_new_addr = true; con->sctp_assoc = 0; - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -579,15 +606,56 @@ static void sctp_init_failed(void) mutex_unlock(&connections_lock); } +static void retry_failed_sctp_send(struct connection *recv_con, + struct sctp_send_failed *sn_send_failed, + char *buf) +{ + int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); + struct dlm_mhandle *mh; + struct connection *con; + char *retry_buf; + int nodeid = sn_send_failed->ssf_info.sinfo_ppid; + + log_print("Retry sending %d bytes to node id %d", len, nodeid); + + con = nodeid2con(nodeid, 0); + if (!con) { + log_print("Could not look up con for nodeid %d\n", + nodeid); + return; + } + + mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); + if (!mh) { + log_print("Could not allocate buf for retry."); + return; + } + memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); + dlm_lowcomms_commit_buffer(mh); + + /* + * If we got a assoc changed event before the send failed event then + * we only need to retry the send. + */ + if (con->sctp_assoc) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } else + sctp_init_failed_foreach(con); +} + /* Something happened to an association */ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; - if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { + switch (sn->sn_header.sn_type) { + case SCTP_SEND_FAILED: + retry_failed_sctp_send(con, &sn->sn_send_failed, buf); + break; + case SCTP_ASSOC_CHANGE: switch (sn->sn_assoc_change.sac_state) { - case SCTP_COMM_UP: case SCTP_RESTART: { @@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con, log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); + new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; + new_con->try_new_addr = false; /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); - clear_bit(CF_INIT_PENDING, &con->flags); + clear_bit(CF_INIT_PENDING, &new_con->flags); if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { queue_work(send_workqueue, &new_con->swork); } @@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con, } break; - /* We don't know which INIT failed, so clear the PENDING flags - * on them all. if assoc_id is zero then it will then try - * again */ - case SCTP_CANT_STR_ASSOC: { + /* Will retry init when we get the send failed notification */ log_print("Can't start SCTP association - retrying"); - sctp_init_failed(); } break; @@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con, (int)sn->sn_assoc_change.sac_assoc_id, sn->sn_assoc_change.sac_state); } + default: + ; /* fall through */ } } @@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e) kfree(e); } +/* + * writequeue_entry_complete - try to delete and free write queue entry + * @e: write queue entry to try to delete + * @completed: bytes completed + * + * writequeue_lock must be held. + */ +static void writequeue_entry_complete(struct writequeue_entry *e, int completed) +{ + e->offset += completed; + e->len -= completed; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + } +} + /* Initiate an SCTP association. This is a special case of send_to_sock() in that we don't yet have a peeled-off socket for this association, so we use the listening socket @@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con) int addrlen; struct kvec iov[1]; + mutex_lock(&con->sock_mutex); if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) - return; - - if (con->retries++ > MAX_CONNECT_RETRIES) - return; + goto unlock; - if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { + if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, + con->try_new_addr)) { log_print("no address for nodeid %d", con->nodeid); - return; + goto unlock; } base_con = nodeid2con(0, 0); BUG_ON(base_con == NULL); @@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con) if (list_empty(&con->writequeue)) { spin_unlock(&con->writequeue_lock); log_print("writequeue empty for nodeid %d", con->nodeid); - return; + goto unlock; } e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; - spin_unlock(&con->writequeue_lock); /* Send the first block off the write queue */ iov[0].iov_base = page_address(e->page)+offset; iov[0].iov_len = len; + spin_unlock(&con->writequeue_lock); + + if (rem_addr.ss_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; + log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; + log_print("Trying to connect to %pI6", &sin6->sin6_addr); + } cmsg = CMSG_FIRSTHDR(&outmessage); cmsg->cmsg_level = IPPROTO_SCTP; @@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con) cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); + sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); outmessage.msg_controllen = cmsg->cmsg_len; + sinfo->sinfo_flags |= SCTP_ADDR_OVER; ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); if (ret < 0) { @@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con) } else { spin_lock(&con->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - } + writequeue_entry_complete(e, ret); spin_unlock(&con->writequeue_lock); } + +unlock: + mutex_unlock(&con->sock_mutex); } /* Connect a new socket to its peer */ @@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out_err; @@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void) int result = -EINVAL, num = 1, i, addr_len; struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; + int one = 1; if (!con) return -ENOMEM; @@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void) goto create_delsock; } + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, + sizeof(one)); + if (result < 0) + log_print("Could not set SCTP NODELAY error %d\n", result); + /* Init con struct */ sock->sk->sk_user_data = con; con->sock = sock; @@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con) } spin_lock(&con->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - } + writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); out: diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 01fd5c11a7fb..f704458ea5f5 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -247,6 +247,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct dlm_ls *ls; struct plock_op *op; int rv; + unsigned char fl_flags = fl->fl_flags; ls = dlm_find_lockspace_local(lockspace); if (!ls) @@ -258,9 +259,18 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, goto out; } - if (posix_lock_file_wait(file, fl) < 0) - log_error(ls, "dlm_posix_unlock: vfs unlock error %llx", - (unsigned long long)number); + /* cause the vfs unlock to return ENOENT if lock is not found */ + fl->fl_flags |= FL_EXISTS; + + rv = posix_lock_file_wait(file, fl); + if (rv == -ENOENT) { + rv = 0; + goto out_free; + } + if (rv < 0) { + log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx", + rv, (unsigned long long)number); + } op->info.optype = DLM_PLOCK_OP_UNLOCK; op->info.pid = fl->fl_pid; @@ -296,9 +306,11 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (rv == -ENOENT) rv = 0; +out_free: kfree(op); out: dlm_put_lockspace(ls); + fl->fl_flags = fl_flags; return rv; } EXPORT_SYMBOL_GPL(dlm_posix_unlock); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index d5c25db4398f..d10757635b9c 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -37,16 +37,8 @@ #include <asm/unaligned.h> #include "ecryptfs_kernel.h" -static int -ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, - struct page *dst_page, int dst_offset, - struct page *src_page, int src_offset, int size, - unsigned char *iv); -static int -ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, - struct page *dst_page, int dst_offset, - struct page *src_page, int src_offset, int size, - unsigned char *iv); +#define DECRYPT 0 +#define ENCRYPT 1 /** * ecryptfs_to_hex @@ -243,7 +235,7 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) struct ecryptfs_key_sig *key_sig, *key_sig_tmp; if (crypt_stat->tfm) - crypto_free_blkcipher(crypt_stat->tfm); + crypto_free_ablkcipher(crypt_stat->tfm); if (crypt_stat->hash_tfm) crypto_free_hash(crypt_stat->hash_tfm); list_for_each_entry_safe(key_sig, key_sig_tmp, @@ -319,26 +311,40 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, return i; } +struct extent_crypt_result { + struct completion completion; + int rc; +}; + +static void extent_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct extent_crypt_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->rc = rc; + complete(&ecr->completion); +} + /** - * encrypt_scatterlist + * crypt_scatterlist * @crypt_stat: Pointer to the crypt_stat struct to initialize. - * @dest_sg: Destination of encrypted data - * @src_sg: Data to be encrypted - * @size: Length of data to be encrypted - * @iv: iv to use during encryption + * @dst_sg: Destination of the data after performing the crypto operation + * @src_sg: Data to be encrypted or decrypted + * @size: Length of data + * @iv: IV to use + * @op: ENCRYPT or DECRYPT to indicate the desired operation * - * Returns the number of bytes encrypted; negative value on error + * Returns the number of bytes encrypted or decrypted; negative value on error */ -static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, - struct scatterlist *dest_sg, - struct scatterlist *src_sg, int size, - unsigned char *iv) +static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dst_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv, int op) { - struct blkcipher_desc desc = { - .tfm = crypt_stat->tfm, - .info = iv, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; + struct ablkcipher_request *req = NULL; + struct extent_crypt_result ecr; int rc = 0; BUG_ON(!crypt_stat || !crypt_stat->tfm @@ -349,63 +355,88 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); } - /* Consider doing this once, when the file is opened */ + + init_completion(&ecr.completion); + mutex_lock(&crypt_stat->cs_tfm_mutex); - if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { - rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key, - crypt_stat->key_size); - crypt_stat->flags |= ECRYPTFS_KEY_SET; - } - if (rc) { - ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", - rc); + req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); + if (!req) { mutex_unlock(&crypt_stat->cs_tfm_mutex); - rc = -EINVAL; + rc = -ENOMEM; goto out; } - ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); - crypto_blkcipher_encrypt_iv(&desc, dest_sg, src_sg, size); + + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + extent_crypt_complete, &ecr); + /* Consider doing this once, when the file is opened */ + if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { + rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_ERR, + "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + crypt_stat->flags |= ECRYPTFS_KEY_SET; + } mutex_unlock(&crypt_stat->cs_tfm_mutex); + ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); + rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) : + crypto_ablkcipher_decrypt(req); + if (rc == -EINPROGRESS || rc == -EBUSY) { + struct extent_crypt_result *ecr = req->base.data; + + wait_for_completion(&ecr->completion); + rc = ecr->rc; + INIT_COMPLETION(ecr->completion); + } out: + ablkcipher_request_free(req); return rc; } /** - * ecryptfs_lower_offset_for_extent + * lower_offset_for_page * * Convert an eCryptfs page index into a lower byte offset */ -static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, - struct ecryptfs_crypt_stat *crypt_stat) +static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, + struct page *page) { - (*offset) = ecryptfs_lower_header_size(crypt_stat) - + (crypt_stat->extent_size * extent_num); + return ecryptfs_lower_header_size(crypt_stat) + + (page->index << PAGE_CACHE_SHIFT); } /** - * ecryptfs_encrypt_extent - * @enc_extent_page: Allocated page into which to encrypt the data in - * @page + * crypt_extent * @crypt_stat: crypt_stat containing cryptographic context for the * encryption operation - * @page: Page containing plaintext data extent to encrypt + * @dst_page: The page to write the result into + * @src_page: The page to read from * @extent_offset: Page extent offset for use in generating IV + * @op: ENCRYPT or DECRYPT to indicate the desired operation * - * Encrypts one extent of data. + * Encrypts or decrypts one extent of data. * * Return zero on success; non-zero otherwise */ -static int ecryptfs_encrypt_extent(struct page *enc_extent_page, - struct ecryptfs_crypt_stat *crypt_stat, - struct page *page, - unsigned long extent_offset) +static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, + struct page *src_page, + unsigned long extent_offset, int op) { + pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index; loff_t extent_base; char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + struct scatterlist src_sg, dst_sg; + size_t extent_size = crypt_stat->extent_size; int rc; - extent_base = (((loff_t)page->index) - * (PAGE_CACHE_SIZE / crypt_stat->extent_size)); + extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size)); rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { @@ -414,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, (unsigned long long)(extent_base + extent_offset), rc); goto out; } - rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, - page, (extent_offset - * crypt_stat->extent_size), - crypt_stat->extent_size, extent_iv); + + sg_init_table(&src_sg, 1); + sg_init_table(&dst_sg, 1); + + sg_set_page(&src_sg, src_page, extent_size, + extent_offset * extent_size); + sg_set_page(&dst_sg, dst_page, extent_size, + extent_offset * extent_size); + + rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size, + extent_iv, op); if (rc < 0) { - printk(KERN_ERR "%s: Error attempting to encrypt page with " - "page->index = [%ld], extent_offset = [%ld]; " - "rc = [%d]\n", __func__, page->index, extent_offset, - rc); + printk(KERN_ERR "%s: Error attempting to crypt page with " + "page_index = [%ld], extent_offset = [%ld]; " + "rc = [%d]\n", __func__, page_index, extent_offset, rc); goto out; } rc = 0; @@ -453,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page) char *enc_extent_virt; struct page *enc_extent_page = NULL; loff_t extent_offset; + loff_t lower_offset; int rc = 0; ecryptfs_inode = page->mapping->host; @@ -466,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page) "encrypted extent\n"); goto out; } - enc_extent_virt = kmap(enc_extent_page); + for (extent_offset = 0; extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); extent_offset++) { - loff_t offset; - - rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page, - extent_offset); + rc = crypt_extent(crypt_stat, enc_extent_page, page, + extent_offset, ENCRYPT); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " "rc = [%d]\n", __func__, rc); goto out; } - ecryptfs_lower_offset_for_extent( - &offset, ((((loff_t)page->index) - * (PAGE_CACHE_SIZE - / crypt_stat->extent_size)) - + extent_offset), crypt_stat); - rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, - offset, crypt_stat->extent_size); - if (rc < 0) { - ecryptfs_printk(KERN_ERR, "Error attempting " - "to write lower page; rc = [%d]" - "\n", rc); - goto out; - } } - rc = 0; -out: - if (enc_extent_page) { - kunmap(enc_extent_page); - __free_page(enc_extent_page); - } - return rc; -} -static int ecryptfs_decrypt_extent(struct page *page, - struct ecryptfs_crypt_stat *crypt_stat, - struct page *enc_extent_page, - unsigned long extent_offset) -{ - loff_t extent_base; - char extent_iv[ECRYPTFS_MAX_IV_BYTES]; - int rc; - - extent_base = (((loff_t)page->index) - * (PAGE_CACHE_SIZE / crypt_stat->extent_size)); - rc = ecryptfs_derive_iv(extent_iv, crypt_stat, - (extent_base + extent_offset)); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " - "extent [0x%.16llx]; rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - goto out; - } - rc = ecryptfs_decrypt_page_offset(crypt_stat, page, - (extent_offset - * crypt_stat->extent_size), - enc_extent_page, 0, - crypt_stat->extent_size, extent_iv); + lower_offset = lower_offset_for_page(crypt_stat, page); + enc_extent_virt = kmap(enc_extent_page); + rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, + PAGE_CACHE_SIZE); + kunmap(enc_extent_page); if (rc < 0) { - printk(KERN_ERR "%s: Error attempting to decrypt to page with " - "page->index = [%ld], extent_offset = [%ld]; " - "rc = [%d]\n", __func__, page->index, extent_offset, - rc); + ecryptfs_printk(KERN_ERR, + "Error attempting to write lower page; rc = [%d]\n", + rc); goto out; } rc = 0; out: + if (enc_extent_page) { + __free_page(enc_extent_page); + } return rc; } @@ -558,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; - char *enc_extent_virt; - struct page *enc_extent_page = NULL; + char *page_virt; unsigned long extent_offset; + loff_t lower_offset; int rc = 0; ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); - enc_extent_page = alloc_page(GFP_USER); - if (!enc_extent_page) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Error allocating memory for " - "encrypted extent\n"); + + lower_offset = lower_offset_for_page(crypt_stat, page); + page_virt = kmap(page); + rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE, + ecryptfs_inode); + kunmap(page); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, + "Error attempting to read lower page; rc = [%d]\n", + rc); goto out; } - enc_extent_virt = kmap(enc_extent_page); + for (extent_offset = 0; extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); extent_offset++) { - loff_t offset; - - ecryptfs_lower_offset_for_extent( - &offset, ((page->index * (PAGE_CACHE_SIZE - / crypt_stat->extent_size)) - + extent_offset), crypt_stat); - rc = ecryptfs_read_lower(enc_extent_virt, offset, - crypt_stat->extent_size, - ecryptfs_inode); - if (rc < 0) { - ecryptfs_printk(KERN_ERR, "Error attempting " - "to read lower page; rc = [%d]" - "\n", rc); - goto out; - } - rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page, - extent_offset); + rc = crypt_extent(crypt_stat, page, page, + extent_offset, DECRYPT); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " "rc = [%d]\n", __func__, rc); @@ -602,116 +590,9 @@ int ecryptfs_decrypt_page(struct page *page) } } out: - if (enc_extent_page) { - kunmap(enc_extent_page); - __free_page(enc_extent_page); - } - return rc; -} - -/** - * decrypt_scatterlist - * @crypt_stat: Cryptographic context - * @dest_sg: The destination scatterlist to decrypt into - * @src_sg: The source scatterlist to decrypt from - * @size: The number of bytes to decrypt - * @iv: The initialization vector to use for the decryption - * - * Returns the number of bytes decrypted; negative value on error - */ -static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, - struct scatterlist *dest_sg, - struct scatterlist *src_sg, int size, - unsigned char *iv) -{ - struct blkcipher_desc desc = { - .tfm = crypt_stat->tfm, - .info = iv, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; - int rc = 0; - - /* Consider doing this once, when the file is opened */ - mutex_lock(&crypt_stat->cs_tfm_mutex); - rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key, - crypt_stat->key_size); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", - rc); - mutex_unlock(&crypt_stat->cs_tfm_mutex); - rc = -EINVAL; - goto out; - } - ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size); - rc = crypto_blkcipher_decrypt_iv(&desc, dest_sg, src_sg, size); - mutex_unlock(&crypt_stat->cs_tfm_mutex); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n", - rc); - goto out; - } - rc = size; -out: return rc; } -/** - * ecryptfs_encrypt_page_offset - * @crypt_stat: The cryptographic context - * @dst_page: The page to encrypt into - * @dst_offset: The offset in the page to encrypt into - * @src_page: The page to encrypt from - * @src_offset: The offset in the page to encrypt from - * @size: The number of bytes to encrypt - * @iv: The initialization vector to use for the encryption - * - * Returns the number of bytes encrypted - */ -static int -ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, - struct page *dst_page, int dst_offset, - struct page *src_page, int src_offset, int size, - unsigned char *iv) -{ - struct scatterlist src_sg, dst_sg; - - sg_init_table(&src_sg, 1); - sg_init_table(&dst_sg, 1); - - sg_set_page(&src_sg, src_page, size, src_offset); - sg_set_page(&dst_sg, dst_page, size, dst_offset); - return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); -} - -/** - * ecryptfs_decrypt_page_offset - * @crypt_stat: The cryptographic context - * @dst_page: The page to decrypt into - * @dst_offset: The offset in the page to decrypt into - * @src_page: The page to decrypt from - * @src_offset: The offset in the page to decrypt from - * @size: The number of bytes to decrypt - * @iv: The initialization vector to use for the decryption - * - * Returns the number of bytes decrypted - */ -static int -ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, - struct page *dst_page, int dst_offset, - struct page *src_page, int src_offset, int size, - unsigned char *iv) -{ - struct scatterlist src_sg, dst_sg; - - sg_init_table(&src_sg, 1); - sg_set_page(&src_sg, src_page, size, src_offset); - - sg_init_table(&dst_sg, 1); - sg_set_page(&dst_sg, dst_page, size, dst_offset); - - return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); -} - #define ECRYPTFS_MAX_SCATTERLIST_LEN 4 /** @@ -746,8 +627,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) crypt_stat->cipher, "cbc"); if (rc) goto out_unlock; - crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0, - CRYPTO_ALG_ASYNC); + crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); kfree(full_alg_name); if (IS_ERR(crypt_stat->tfm)) { rc = PTR_ERR(crypt_stat->tfm); @@ -757,7 +637,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) crypt_stat->cipher); goto out_unlock; } - crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); rc = 0; out_unlock: mutex_unlock(&crypt_stat->cs_tfm_mutex); @@ -2182,12 +2062,11 @@ out: */ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, size_t *plaintext_name_size, - struct dentry *ecryptfs_dir_dentry, + struct super_block *sb, const char *name, size_t name_size) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private( - ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; char *decoded_name; size_t decoded_name_size; size_t packet_size; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index dd299b389d4e..df19d34a033b 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -38,6 +38,7 @@ #include <linux/nsproxy.h> #include <linux/backing-dev.h> #include <linux/ecryptfs.h> +#include <linux/crypto.h> #define ECRYPTFS_DEFAULT_IV_BYTES 16 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 @@ -233,7 +234,7 @@ struct ecryptfs_crypt_stat { size_t extent_shift; unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct crypto_blkcipher *tfm; + struct crypto_ablkcipher *tfm; struct crypto_hash *hash_tfm; /* Crypto context for generating * the initialization vectors */ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; @@ -574,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, struct inode *ecryptfs_inode); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, size_t *decrypted_name_size, - struct dentry *ecryptfs_dentry, + struct super_block *sb, const char *name, size_t name_size); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_encrypt_and_encode_filename( diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..992cf95830b5 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/aio.h> #include "ecryptfs_kernel.h" /** @@ -48,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, unsigned long nr_segs, loff_t pos) { ssize_t rc; - struct path lower; + struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_aio_read(iocb, iov, nr_segs, pos); @@ -59,17 +60,16 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, if (-EIOCBQUEUED == rc) rc = wait_on_sync_kiocb(iocb); if (rc >= 0) { - lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); - lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); - touch_atime(&lower); + path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); + touch_atime(path); } return rc; } struct ecryptfs_getdents_callback { - void *dirent; - struct dentry *dentry; - filldir_t filldir; + struct dir_context ctx; + struct dir_context *caller; + struct super_block *sb; int filldir_called; int entries_written; }; @@ -87,7 +87,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, buf->filldir_called++; rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, - buf->dentry, lower_name, + buf->sb, lower_name, lower_namelen); if (rc) { printk(KERN_ERR "%s: Error attempting to decode and decrypt " @@ -95,9 +95,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, rc); goto out; } - rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + buf->caller->pos = buf->ctx.pos; + rc = !dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); - if (rc >= 0) + if (!rc) buf->entries_written++; out: return rc; @@ -106,27 +107,22 @@ out: /** * ecryptfs_readdir * @file: The eCryptfs directory file - * @dirent: Directory entry handle - * @filldir: The filldir callback function + * @ctx: The actor to feed the entries to */ -static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) { int rc; struct file *lower_file; - struct inode *inode; - struct ecryptfs_getdents_callback buf; - + struct inode *inode = file_inode(file); + struct ecryptfs_getdents_callback buf = { + .ctx.actor = ecryptfs_filldir, + .caller = ctx, + .sb = inode->i_sb, + }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = file->f_pos; - inode = file_inode(file); - memset(&buf, 0, sizeof(buf)); - buf.dirent = dirent; - buf.dentry = file->f_path.dentry; - buf.filldir = filldir; - buf.filldir_called = 0; - buf.entries_written = 0; - rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); - file->f_pos = lower_file->f_pos; + lower_file->f_pos = ctx->pos; + rc = iterate_dir(lower_file, &buf.ctx); + ctx->pos = buf.ctx.pos; if (rc < 0) goto out; if (buf.filldir_called && !buf.entries_written) @@ -294,6 +290,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + int rc; + + rc = filemap_write_and_wait(file->f_mapping); + if (rc) + return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } @@ -337,7 +339,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif const struct file_operations ecryptfs_dir_fops = { - .readdir = ecryptfs_readdir, + .iterate = ecryptfs_readdir, .read = generic_read_dir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT @@ -358,7 +360,7 @@ const struct file_operations ecryptfs_main_fops = { .aio_read = ecryptfs_read_update_atime, .write = do_sync_write, .aio_write = generic_file_aio_write, - .readdir = ecryptfs_readdir, + .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5eab400e2590..67e9b6339691 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); - BUG_ON(!lower_dentry->d_count); + BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); ecryptfs_set_dentry_lower(dentry, lower_dentry); @@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(old_fs); if (rc < 0) goto out; - rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, + rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, lower_buf, rc); out: kfree(lower_buf); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index e924cf45aad9..eb1c5979ecaf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; - rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, + rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " - "rc = [%d]\n", lower_dentry, lower_mnt, rc); + "rc = [%d]\n", path->dentry, path->mnt, rc); (*lower_file) = NULL; } return rc; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 49ff8ea08f1c..e57380e5f6bd 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon, goto unlock; } msg_size = (sizeof(*msg) + msg->data_len); - msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); + msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } - memcpy(msg_ctx->msg, msg, msg_size); msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; wake_up_process(msg_ctx->task); rc = 0; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 6a160539cd23..09fe622274e4 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -232,17 +232,10 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode) { struct file *lower_file; - mm_segment_t fs_save; - ssize_t rc; - lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; if (!lower_file) return -EIO; - fs_save = get_fs(); - set_fs(get_ds()); - rc = vfs_read(lower_file, data, size, &offset); - set_fs(fs_save); - return rc; + return kernel_read(lower_file, offset, data, size); } /** diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig new file mode 100644 index 000000000000..367bbb10c543 --- /dev/null +++ b/fs/efivarfs/Kconfig @@ -0,0 +1,12 @@ +config EFIVAR_FS + tristate "EFI Variable filesystem" + depends on EFI + help + efivarfs is a replacement filesystem for the old EFI + variable support via sysfs, as it doesn't suffer from the + same 1024-byte variable size limit. + + To compile this file system support as a module, choose M + here. The module will be called efivarfs. + + If unsure, say N. diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile new file mode 100644 index 000000000000..955d478177d5 --- /dev/null +++ b/fs/efivarfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the efivarfs filesystem +# + +obj-$(CONFIG_EFIVAR_FS) += efivarfs.o + +efivarfs-objs := inode.o file.o super.o diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c new file mode 100644 index 000000000000..8dd524f32284 --- /dev/null +++ b/fs/efivarfs/file.c @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/slab.h> + +#include "internal.h" + +static ssize_t efivarfs_file_write(struct file *file, + const char __user *userbuf, size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + void *data; + u32 attributes; + struct inode *inode = file->f_mapping->host; + unsigned long datasize = count - sizeof(attributes); + ssize_t bytes = 0; + bool set = false; + + if (count < sizeof(attributes)) + return -EINVAL; + + if (copy_from_user(&attributes, userbuf, sizeof(attributes))) + return -EFAULT; + + if (attributes & ~(EFI_VARIABLE_MASK)) + return -EINVAL; + + data = kmalloc(datasize, GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) { + bytes = -EFAULT; + goto out; + } + + bytes = efivar_entry_set_get_size(var, attributes, &datasize, + data, &set); + if (!set && bytes) { + if (bytes == -ENOENT) + bytes = -EIO; + goto out; + } + + if (bytes == -ENOENT) { + drop_nlink(inode); + d_delete(file->f_dentry); + dput(file->f_dentry); + } else { + mutex_lock(&inode->i_mutex); + i_size_write(inode, datasize + sizeof(attributes)); + mutex_unlock(&inode->i_mutex); + } + + bytes = count; + +out: + kfree(data); + + return bytes; +} + +static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + int err; + + err = efivar_entry_size(var, &datasize); + + /* + * efivarfs represents uncommitted variables with + * zero-length files. Reading them should return EOF. + */ + if (err == -ENOENT) + return 0; + else if (err) + return err; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + size = efivar_entry_get(var, &attributes, &datasize, + data + sizeof(attributes)); + if (size) + goto out_free; + + memcpy(data, &attributes, sizeof(attributes)); + size = simple_read_from_buffer(userbuf, count, ppos, + data, datasize + sizeof(attributes)); +out_free: + kfree(data); + + return size; +} + +const struct file_operations efivarfs_file_operations = { + .open = simple_open, + .read = efivarfs_file_read, + .write = efivarfs_file_write, + .llseek = no_llseek, +}; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c new file mode 100644 index 000000000000..7e787fb90293 --- /dev/null +++ b/fs/efivarfs/inode.c @@ -0,0 +1,174 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/slab.h> + +#include "internal.h" + +struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &efivarfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &efivarfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +/* + * Return true if 'str' is a valid efivarfs filename of the form, + * + * VariableName-12345678-1234-1234-1234-1234567891bc + */ +bool efivarfs_valid_name(const char *str, int len) +{ + static const char dashes[EFI_VARIABLE_GUID_LEN] = { + [8] = 1, [13] = 1, [18] = 1, [23] = 1 + }; + const char *s = str + len - EFI_VARIABLE_GUID_LEN; + int i; + + /* + * We need a GUID, plus at least one letter for the variable name, + * plus the '-' separator + */ + if (len < EFI_VARIABLE_GUID_LEN + 2) + return false; + + /* GUID must be preceded by a '-' */ + if (*(s - 1) != '-') + return false; + + /* + * Validate that 's' is of the correct format, e.g. + * + * 12345678-1234-1234-1234-123456789abc + */ + for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { + if (dashes[i]) { + if (*s++ != '-') + return false; + } else { + if (!isxdigit(*s++)) + return false; + } + } + + return true; +} + +static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) +{ + guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); + guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); + guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); + guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); + guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); + guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); + guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); + guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); + guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); + guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); + guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); + guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); + guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); + guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); + guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); + guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); +} + +static int efivarfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct inode *inode; + struct efivar_entry *var; + int namelen, i = 0, err = 0; + + if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) + return -EINVAL; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOMEM; + + var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); + if (!var) { + err = -ENOMEM; + goto out; + } + + /* length of the variable name itself: remove GUID and separator */ + namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; + + efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, + &var->var.VendorGuid); + + for (i = 0; i < namelen; i++) + var->var.VariableName[i] = dentry->d_name.name[i]; + + var->var.VariableName[i] = '\0'; + + inode->i_private = var; + + efivar_entry_add(var, &efivarfs_list); + d_instantiate(dentry, inode); + dget(dentry); +out: + if (err) { + kfree(var); + iput(inode); + } + return err; +} + +static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efivar_entry *var = dentry->d_inode->i_private; + + if (efivar_entry_delete(var)) + return -EINVAL; + + drop_nlink(dentry->d_inode); + dput(dentry); + return 0; +}; + +/* + * Handle negative dentry. + */ +static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + +const struct inode_operations efivarfs_dir_inode_operations = { + .lookup = efivarfs_lookup, + .unlink = efivarfs_unlink, + .create = efivarfs_create, +}; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h new file mode 100644 index 000000000000..b5ff16addb7c --- /dev/null +++ b/fs/efivarfs/internal.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef EFIVAR_FS_INTERNAL_H +#define EFIVAR_FS_INTERNAL_H + +#include <linux/list.h> + +extern const struct file_operations efivarfs_file_operations; +extern const struct inode_operations efivarfs_dir_inode_operations; +extern bool efivarfs_valid_name(const char *str, int len); +extern struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev); + +extern struct list_head efivarfs_list; + +#endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c new file mode 100644 index 000000000000..a8766b880c07 --- /dev/null +++ b/fs/efivarfs/super.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ctype.h> +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/ucs2_string.h> +#include <linux/slab.h> +#include <linux/magic.h> + +#include "internal.h" + +LIST_HEAD(efivarfs_list); + +static void efivarfs_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct super_operations efivarfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = efivarfs_evict_inode, + .show_options = generic_show_options, +}; + +static struct super_block *efivarfs_sb; + +/* + * Compare two efivarfs file names. + * + * An efivarfs filename is composed of two parts, + * + * 1. A case-sensitive variable name + * 2. A case-insensitive GUID + * + * So we need to perform a case-sensitive match on part 1 and a + * case-insensitive match on part 2. + */ +static int efivarfs_d_compare(const struct dentry *parent, + const struct dentry *dentry, + unsigned int len, const char *str, + const struct qstr *name) +{ + int guid = len - EFI_VARIABLE_GUID_LEN; + + if (name->len != len) + return 1; + + /* Case-sensitive compare for the variable name */ + if (memcmp(str, name->name, guid)) + return 1; + + /* Case-insensitive compare for the GUID */ + return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); +} + +static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) +{ + unsigned long hash = init_name_hash(); + const unsigned char *s = qstr->name; + unsigned int len = qstr->len; + + if (!efivarfs_valid_name(s, len)) + return -EINVAL; + + while (len-- > EFI_VARIABLE_GUID_LEN) + hash = partial_name_hash(*s++, hash); + + /* GUID is case-insensitive. */ + while (len--) + hash = partial_name_hash(tolower(*s++), hash); + + qstr->hash = end_name_hash(hash); + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. + */ +static int efivarfs_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations efivarfs_d_ops = { + .d_compare = efivarfs_d_compare, + .d_hash = efivarfs_d_hash, + .d_delete = efivarfs_delete_dentry, +}; + +static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) +{ + struct dentry *d; + struct qstr q; + int err; + + q.name = name; + q.len = strlen(name); + + err = efivarfs_d_hash(NULL, &q); + if (err) + return ERR_PTR(err); + + d = d_alloc(parent, &q); + if (d) + return d; + + return ERR_PTR(-ENOMEM); +} + +static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct efivar_entry *entry; + struct inode *inode = NULL; + struct dentry *dentry, *root = sb->s_root; + unsigned long size = 0; + char *name; + int len, i; + int err = -ENOMEM; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return err; + + memcpy(entry->var.VariableName, name16, name_size); + memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); + + len = ucs2_strlen(entry->var.VariableName); + + /* name, plus '-', plus GUID, plus NUL*/ + name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + if (!name) + goto fail; + + for (i = 0; i < len; i++) + name[i] = entry->var.VariableName[i] & 0xFF; + + name[len] = '-'; + + efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + + name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + + inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + if (!inode) + goto fail_name; + + dentry = efivarfs_alloc_dentry(root, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto fail_inode; + } + + /* copied by the above to local storage in the dentry. */ + kfree(name); + + efivar_entry_size(entry, &size); + efivar_entry_add(entry, &efivarfs_list); + + mutex_lock(&inode->i_mutex); + inode->i_private = entry; + i_size_write(inode, size + sizeof(entry->var.Attributes)); + mutex_unlock(&inode->i_mutex); + d_add(dentry, inode); + + return 0; + +fail_inode: + iput(inode); +fail_name: + kfree(name); +fail: + kfree(entry); + return err; +} + +static int efivarfs_destroy(struct efivar_entry *entry, void *data) +{ + efivar_entry_remove(entry); + kfree(entry); + return 0; +} + +static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + efivarfs_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = EFIVARFS_MAGIC; + sb->s_op = &efivarfs_ops; + sb->s_d_op = &efivarfs_d_ops; + sb->s_time_gran = 1; + + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) + return -ENOMEM; + inode->i_op = &efivarfs_dir_inode_operations; + + root = d_make_root(inode); + sb->s_root = root; + if (!root) + return -ENOMEM; + + INIT_LIST_HEAD(&efivarfs_list); + + err = efivar_init(efivarfs_callback, (void *)sb, false, + true, &efivarfs_list); + if (err) + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + + return err; +} + +static struct dentry *efivarfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, efivarfs_fill_super); +} + +static void efivarfs_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + efivarfs_sb = NULL; + + /* Remove all entries and destroy */ + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); +} + +static struct file_system_type efivarfs_type = { + .name = "efivarfs", + .mount = efivarfs_mount, + .kill_sb = efivarfs_kill_sb, +}; + +static __init int efivarfs_init(void) +{ + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + if (!efivars_kobject()) + return 0; + + return register_filesystem(&efivarfs_type); +} + +MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); +MODULE_DESCRIPTION("EFI Variable Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS("efivarfs"); + +module_init(efivarfs_init); diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 055a9e9ca747..b72307ccdf7a 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -7,40 +7,38 @@ #include <linux/buffer_head.h> #include "efs.h" -static int efs_readdir(struct file *, void *, filldir_t); +static int efs_readdir(struct file *, struct dir_context *); const struct file_operations efs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = efs_readdir, + .iterate = efs_readdir, }; const struct inode_operations efs_dir_inode_operations = { .lookup = efs_lookup, }; -static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = file_inode(filp); - struct buffer_head *bh; - - struct efs_dir *dirblock; - struct efs_dentry *dirslot; - efs_ino_t inodenum; +static int efs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); efs_block_t block; - int slot, namelen; - char *nameptr; + int slot; if (inode->i_size & (EFS_DIRBSIZE-1)) printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); /* work out where this entry can be found */ - block = filp->f_pos >> EFS_DIRBSIZE_BITS; + block = ctx->pos >> EFS_DIRBSIZE_BITS; /* each block contains at most 256 slots */ - slot = filp->f_pos & 0xff; + slot = ctx->pos & 0xff; /* look at all blocks */ while (block < inode->i_blocks) { + struct efs_dir *dirblock; + struct buffer_head *bh; + /* read the dir block */ bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); @@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { break; } - while (slot < dirblock->slots) { - if (dirblock->space[slot] == 0) { - slot++; + for (; slot < dirblock->slots; slot++) { + struct efs_dentry *dirslot; + efs_ino_t inodenum; + const char *nameptr; + int namelen; + + if (dirblock->space[slot] == 0) continue; - } dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); @@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { #ifdef DEBUG printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); #endif - if (namelen > 0) { - /* found the next entry */ - filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; - - /* copy filename and data in dirslot */ - filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); - - /* sanity check */ - if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { - printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); - slot++; - continue; - } - - /* store position of next slot */ - if (++slot == dirblock->slots) { - slot = 0; - block++; - } + if (!namelen) + continue; + /* found the next entry */ + ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; + + /* sanity check */ + if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { + printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + continue; + } + + /* copy filename and data in dirslot */ + if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) { brelse(bh); - filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; - goto out; + return 0; } - slot++; } brelse(bh); slot = 0; block++; } - - filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; -out: + ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; return 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9fec1836057a..9ad17b15b454 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,12 +34,14 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/compat.h> /* * LOCKING: @@ -104,7 +106,7 @@ struct epoll_filefd { struct file *file; int fd; -}; +} __packed; /* * Structure used to track possible nested calls, for too deep recursions @@ -128,6 +130,8 @@ struct nested_calls { /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. + * Avoid increasing the size of this struct, there can be many thousands + * of these on a server and we do not want this to take another cache line. */ struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ @@ -158,7 +162,7 @@ struct epitem { struct list_head fllink; /* wakeup_source used when EPOLLWAKEUP is set */ - struct wakeup_source *ws; + struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; @@ -536,6 +540,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) } } +/* call only when ep->mtx is held */ +static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) +{ + return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); +} + +/* call only when ep->mtx is held */ +static inline void ep_pm_stay_awake(struct epitem *epi) +{ + struct wakeup_source *ws = ep_wakeup_source(epi); + + if (ws) + __pm_stay_awake(ws); +} + +static inline bool ep_has_wakeup_source(struct epitem *epi) +{ + return rcu_access_pointer(epi->ws) ? true : false; +} + +/* call when ep->mtx cannot be held (ep_poll_callback) */ +static inline void ep_pm_stay_awake_rcu(struct epitem *epi) +{ + struct wakeup_source *ws; + + rcu_read_lock(); + ws = rcu_dereference(epi->ws); + if (ws) + __pm_stay_awake(ws); + rcu_read_unlock(); +} + /** * ep_scan_ready_list - Scans the ready list in a way that makes possible for * the scan code, to call f_op->poll(). Also allows for @@ -599,7 +635,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } /* @@ -668,7 +704,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); @@ -711,11 +747,15 @@ static void ep_free(struct eventpoll *ep) * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". + * We do not need to lock ep->mtx, either, we only do it to prevent + * a lockdep warning. */ + mutex_lock(&ep->mtx); while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); } + mutex_unlock(&ep->mtx); mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); @@ -734,6 +774,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } +static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +{ + pt->_key = epi->event.events; + + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; +} + static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { @@ -741,10 +788,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, poll_table pt; init_poll_funcptr(&pt, NULL); + list_for_each_entry_safe(epi, tmp, head, rdllink) { - pt._key = epi->event.events; - if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events) + if (ep_item_poll(epi, &pt)) return POLLIN | POLLRDNORM; else { /* @@ -752,7 +798,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ - __pm_relax(epi->ws); + __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } @@ -984,7 +1030,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k /* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake_rcu(epi); } /* @@ -1146,6 +1192,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { const char *name; + struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register("eventpoll"); @@ -1154,17 +1201,29 @@ static int ep_create_wakeup_source(struct epitem *epi) } name = epi->ffd.file->f_path.dentry->d_name.name; - epi->ws = wakeup_source_register(name); - if (!epi->ws) + ws = wakeup_source_register(name); + + if (!ws) return -ENOMEM; + rcu_assign_pointer(epi->ws, ws); return 0; } -static void ep_destroy_wakeup_source(struct epitem *epi) +/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ +static noinline void ep_destroy_wakeup_source(struct epitem *epi) { - wakeup_source_unregister(epi->ws); - epi->ws = NULL; + struct wakeup_source *ws = ep_wakeup_source(epi); + + RCU_INIT_POINTER(epi->ws, NULL); + + /* + * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is + * used internally by wakeup_source_remove, too (called by + * wakeup_source_unregister), so we cannot use call_rcu + */ + synchronize_rcu(); + wakeup_source_unregister(ws); } /* @@ -1199,13 +1258,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, if (error) goto error_create_wakeup_source; } else { - epi->ws = NULL; + RCU_INIT_POINTER(epi->ws, NULL); } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); - epq.pt._key = event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1214,7 +1272,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = tfile->f_op->poll(tfile, &epq.pt); + revents = ep_item_poll(epi, &epq.pt); /* * We have to check if something went wrong during the poll wait queue @@ -1247,7 +1305,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1288,7 +1346,7 @@ error_unregister: list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); error_create_wakeup_source: kmem_cache_free(epi_cache, epi); @@ -1314,12 +1372,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ - pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { - if (!epi->ws) + if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); - } else if (epi->ws) { + } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } @@ -1347,7 +1404,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); + revents = ep_item_poll(epi, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1357,7 +1414,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1383,6 +1440,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); @@ -1405,14 +1463,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ - if (epi->ws && epi->ws->active) - __pm_stay_awake(ep->ws); - __pm_relax(epi->ws); + ws = ep_wakeup_source(epi); + if (ws) { + if (ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(ws); + } + list_del_init(&epi->rdllink); - pt._key = epi->event.events; - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events; + revents = ep_item_poll(epi, &pt); /* * If the event mask intersect the caller-requested one, @@ -1424,7 +1484,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; @@ -1444,7 +1504,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } } @@ -1543,7 +1603,8 @@ fetch_events: } spin_unlock_irqrestore(&ep->lock, flags); - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!freezable_schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); @@ -1916,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return -EINVAL; if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } error = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -1934,12 +1995,58 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, + struct epoll_event __user *, events, + int, maxevents, int, timeout, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + set_current_blocked(&sigsaved); + } + + return err; +} +#endif + static int __init eventpoll_init(void) { struct sysinfo si; @@ -1964,6 +2071,12 @@ static int __init eventpoll_init(void) /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); + /* + * We can have many thousands of epitems, so prevent this from + * using an extra cache line on 64-bit (and smaller) CPUs + */ + BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); + /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); diff --git a/fs/exec.c b/fs/exec.c index a96a4885bbbf..9c73def87642 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, - .intent = LOOKUP_OPEN + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -613,7 +614,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -622,7 +623,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb, new_end, old_end); @@ -756,10 +757,11 @@ struct file *open_exec(const char *name) static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, - .intent = LOOKUP_OPEN + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW); + file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -802,6 +804,15 @@ int kernel_read(struct file *file, loff_t offset, EXPORT_SYMBOL(kernel_read); +ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) +{ + ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + if (res > 0) + flush_icache_range(addr, addr + len); + return res; +} +EXPORT_SYMBOL(read_code); + static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; @@ -898,11 +909,13 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = -1; /* for exit_notify() */ for (;;) { + threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; @@ -919,6 +932,7 @@ static int de_thread(struct task_struct *tsk) * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; + tsk->real_start_time = leader->real_start_time; BUG_ON(!same_thread_group(leader, tsk)); BUG_ON(has_group_leader_pid(tsk)); @@ -934,9 +948,8 @@ static int de_thread(struct task_struct *tsk) * Note: The old leader also uses this pid until release_task * is called. Odd but simple and correct. */ - detach_pid(tsk, PIDTYPE_PID); tsk->pid = leader->pid; - attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); + change_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); @@ -960,6 +973,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); release_task(leader); } @@ -1027,17 +1041,7 @@ EXPORT_SYMBOL_GPL(get_task_comm); void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); - trace_task_rename(tsk, buf); - - /* - * Threads may access current->comm without holding - * the task lock, so write the string carefully. - * Readers without a lock may see incomplete new - * names but are safe from non-terminating string reads. - */ - memset(tsk->comm, 0, TASK_COMM_LEN); - wmb(); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk); @@ -1133,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - /* - * Flush performance counters when crossing a - * security domain: - */ - if (!get_dumpable(current->mm)) - perf_event_exit_task(current); - /* An exec changes our domain. We are no longer part of the thread group */ @@ -1203,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(current->mm) != SUID_DUMP_USER) + perf_event_exit_task(current); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's @@ -1459,7 +1465,6 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; bool clear_in_exec; int retval; - const struct cred *cred = current_cred(); /* * We move the actual failure in case of RLIMIT_NPROC excess from @@ -1468,7 +1473,7 @@ static int do_execve_common(const char *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { + atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 46375896cfc0..49f51ab4caac 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) } static int -exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +exofs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); + int need_revalidate = (file->f_version != inode->i_version); if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) return 0; - types = exofs_filetype_table; - for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct exofs_dir_entry *de; @@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); @@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (offset) { offset = exofs_validate_entry(kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (struct exofs_dir_entry *)(kaddr + offset); @@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) return -EIO; } if (de->inode_no) { - int over; - unsigned char d_type = DT_UNKNOWN; + unsigned char t; - if (types && de->file_type < EXOFS_FT_MAX) - d_type = types[de->file_type]; + if (de->file_type < EXOFS_FT_MAX) + t = exofs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, + if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode_no), - d_type); - if (over) { + t)) { exofs_put_page(page); return 0; } } - filp->f_pos += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } exofs_put_page(page); } - return 0; } @@ -669,5 +663,5 @@ not_empty: const struct file_operations exofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = exofs_readdir, + .iterate = exofs_readdir, }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d1f80abd8828..2ec8eb1ab269 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) return 0; } -static void exofs_invalidatepage(struct page *page, unsigned long offset) +static void exofs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", + page->index, offset, length); WARN_ON(1); } diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f936cb50dc0d..b74422888604 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index b963f38ac298..7682b970d0f1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 262fc9940982..293bc2e47a73 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) } struct getdents_callback { + struct dir_context ctx; char *name; /* name that was found. It already points to a buffer NAME_MAX+1 is size */ unsigned long ino; /* the inum we are looking for */ @@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child) struct inode *dir = path->dentry->d_inode; int error; struct file *file; - struct getdents_callback buffer; + struct getdents_callback buffer = { + .ctx.actor = filldir_one, + .name = name, + .ino = child->d_inode->i_ino + }; error = -ENOTDIR; if (!dir || !S_ISDIR(dir->i_mode)) @@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->readdir) + if (!file->f_op->iterate) goto out_close; - buffer.name = name; - buffer.ino = child->d_inode->i_ino; - buffer.found = 0; buffer.sequence = 0; while (1) { int old_seq = buffer.sequence; - error = vfs_readdir(file, filldir_one, &buffer); + error = iterate_dir(file, &buffer.ctx); if (buffer.found) { error = 0; break; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 4237722bfd27..6e1d4ab09d72 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) } static int -ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) +ext2_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - int need_revalidate = filp->f_version != inode->i_version; + int need_revalidate = file->f_version != inode->i_version; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) return 0; @@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) ext2_error(sb, __func__, "bad page in #%lu", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ext2_validate_entry(kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (ext2_dirent *)(kaddr+offset); @@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) return -EIO; } if (de->inode) { - int over; unsigned char d_type = DT_UNKNOWN; if (types && de->file_type < EXT2_FT_MAX) d_type = types[de->file_type]; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, - le32_to_cpu(de->inode), d_type); - if (over) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + d_type)) { ext2_put_page(page); return 0; } } - filp->f_pos += ext2_rec_len_from_disk(de->rec_len); + ctx->pos += ext2_rec_len_from_disk(de->rec_len); } ext2_put_page(page); } @@ -724,7 +721,7 @@ not_empty: const struct file_operations ext2_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ext2_readdir, + .iterate = ext2_readdir, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext2.h" #include "acl.h" #include "xip.h" diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 73b0d9519836..256dd5f4c1c4 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return ext2_add_nondir(dentry, inode); } +static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode = ext2_new_inode(dir, mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; +} + static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .tmpfile = ext2_tmpfile, }; const struct inode_operations ext2_special_inode_operations = { diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 87eccbbca255..f522425aaa24 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); +static int ext3_dx_readdir(struct file *, struct dir_context *); static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, return error_msg == NULL ? 1 : 0; } -static int ext3_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned long offset; - int i, stored; + int i; struct ext3_dir_entry_2 *de; int err; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - int ret = 0; int dir_has_error = 0; if (is_dx_dir(inode)) { - err = ext3_dx_readdir(filp, dirent, filldir); - if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; - } + err = ext3_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL; + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { - unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + while (ctx->pos < inode->i_size) { + unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); struct buffer_head map_bh; struct buffer_head *bh = NULL; @@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp, if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext3_bread(NULL, inode, blk, 0, &err); } @@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp, if (!dir_has_error) { ext3_error(sb, __func__, "directory #%lu " "contains a hole at offset %lld", - inode->i_ino, filp->f_pos); + inode->i_ino, ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (offset && file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext3_dir_entry_2 *) (bh->b_data + i); @@ -177,53 +169,40 @@ revalidate: i += ext3_rec_len_from_disk(de->rec_len); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); if (!ext3_check_dir_entry ("ext3_readdir", inode, de, bh, offset)) { - /* On error, skip the f_pos to the + /* On error, skip the to the next block. */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); - ret = stored; - goto out; + break; } offset += ext3_rec_len_from_disk(de->rec_len); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored ++; + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext3_rec_len_from_disk(de->rec_len); + ctx->pos += ext3_rec_len_from_disk(de->rec_len); } offset = 0; brelse (bh); + if (ctx->pos < inode->i_size) + if (!dir_relax(inode)) + return 0; } -out: - return ret; + return 0; } static inline int is_32bit_api(void) @@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file * filp, void * dirent, - filldir_t filldir, struct fname *fname) +static bool call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = file_inode(filp); - struct super_block * sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { printk("call_filldir: called with null fname?!?\n"); - return 0; + return true; } - curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; + get_dtype(sb, fname->file_type))) { info->extra_fname = fname; - return error; + return false; } fname = fname->next; } - return 0; + return true; } -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = file_inode(filp); + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = ext3_htree_create_dir_info(filp, filp->f_pos); + info = ext3_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == ext3_get_htree_eof(filp)) + if (ctx->pos == ext3_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp, filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* @@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp, * chain, return them first. */ if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) + if (!call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; @@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext3_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext3_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = ext3_get_htree_eof(filp); + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (!call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); @@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = ext3_get_htree_eof(filp); + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) const struct file_operations ext3_dir_operations = { .llseek = ext3_dir_llseek, .read = generic_read_dir, - .readdir = ext3_readdir, + .iterate = ext3_readdir, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index b31dbd4c46ad..1cb9c7e10c6f 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext3_sync_file_enter(file, datasync); - if (inode->i_sb->s_flags & MS_RDONLY) + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated state */ + smp_rmb(); + if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) + return -EROFS; return 0; - + } ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) goto out; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d512c4bc4ad7..2bd85486b879 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" @@ -218,7 +219,8 @@ void ext3_evict_inode (struct inode *inode) */ if (inode->i_nlink && ext3_should_journal_data(inode) && EXT3_SB(inode->i_sb)->s_journal && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT3_JOURNAL_INO) { tid_t commit_tid = atomic_read(&ei->i_datasync_tid); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; @@ -1823,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static void ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); - trace_ext3_invalidatepage(page, offset); + trace_ext3_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset, length); } static int ext3_releasepage(struct page *page, gfp_t wait) @@ -1982,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 692de13e3596..998ea111e537 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); - return count; + /* silently ignore the rest of the block */ + break; } ext3fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -1762,6 +1759,45 @@ retry: return err; } +static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT3_XATTR_TRANS_BLOCKS); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + inode = ext3_new_inode (handle, dir, NULL, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + err = ext3_orphan_add(handle, inode); + if (err) + goto err_drop_inode; + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + ext3_journal_stop(handle); + unlock_new_inode(inode); + iput(inode); + return err; +} + static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { handle_t *handle; @@ -2303,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS); + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2317,6 +2353,11 @@ retry: err = ext3_add_entry(handle, dentry, inode); if (!err) { ext3_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext3_orphan_del(handle, inode); d_instantiate(dentry, inode); } else { drop_nlink(inode); @@ -2519,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = { .mkdir = ext3_mkdir, .rmdir = ext3_rmdir, .mknod = ext3_mknod, + .tmpfile = ext3_tmpfile, .rename = ext3_rename, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR diff --git a/fs/ext3/super.c b/fs/ext3/super.c index fb5120a5505c..c47f14750722 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb) if (test_opt (sb, ERRORS_RO)) { ext3_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } ext3_commit_super(sb, es, 1); @@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function, ext3_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - sb->s_flags |= MS_RDONLY; set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } @@ -362,22 +373,19 @@ fail: /* * Release the journal device */ -static int ext3_blkdev_put(struct block_device *bdev) +static void ext3_blkdev_put(struct block_device *bdev) { - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +static void ext3_blkdev_remove(struct ext3_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext3_blkdev_put(bdev); + ext3_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -2067,7 +2075,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - sb->s_flags |= MS_SNAP_STABLE; return 0; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 987358740cb9..efea5d5c44ce 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -71,4 +71,5 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92e68b33fffd..58339393fa6e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, */ /* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + block) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ @@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } -static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, - ext4_group_t block_group) +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); - if (actual_group == block_group) - return 1; - return 0; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 1 : 0; } /* Return the number of clusters used for file system metadata; this @@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - s64 free_clusters, dirty_clusters, root_clusters; + s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ - root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); @@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) + if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } @@ -653,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) static inline int test_root(ext4_group_t a, int b) { - int num = b; - - while (a > num) - num *= b; - return num == a; + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } } static int ext4_group_sparse(ext4_group_t group) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d8cd1f0f4661..3c7d288ae94c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -29,8 +29,7 @@ #include "ext4.h" #include "xattr.h" -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir); +static int ext4_dx_readdir(struct file *, struct dir_context *); /** * Check if the given dir-inode refers to an htree-indexed directory @@ -46,7 +45,8 @@ static int is_dx_dir(struct inode *inode) if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) return 1; return 0; @@ -102,59 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 1; } -static int ext4_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int ext4_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; int err; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - int ret = 0; int dir_has_error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, - &has_inline_data); - if (has_inline_data) - return ret; - } - if (is_dx_dir(inode)) { - err = ext4_dx_readdir(filp, dirent, filldir); + err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; + return err; } /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - ext4_clear_inode_flag(file_inode(filp), + ext4_clear_inode_flag(file_inode(file), EXT4_INODE_INDEX); } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + int ret = ext4_read_inline_dir(file, ctx, + &has_inline_data); + if (has_inline_data) + return ret; + } + stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { + while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; struct buffer_head *bh = NULL; - map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); } @@ -164,16 +161,16 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_FILE(filp, 0, + EXT4_ERROR_FILE(file, 0, "directory contains a " "hole at offset %llu", - (unsigned long long) filp->f_pos); + (unsigned long long) ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } @@ -181,21 +178,20 @@ static int ext4_readdir(struct file *filp, if (!buffer_verified(bh) && !ext4_dirent_csum_verify(inode, (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_FILE(filp, 0, "directory fails checksum " + EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", - (unsigned long long)filp->f_pos); - filp->f_pos += sb->s_blocksize - offset; + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; brelse(bh); continue; } set_buffer_verified(bh); -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -212,57 +208,46 @@ revalidate: sb->s_blocksize); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, bh, + if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* - * On error, skip the f_pos to the next block + * On error, skip to the next block */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse(bh); - ret = stored; - goto out; + break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, + if (!dir_emit(ctx, de->name, de->name_len, - filp->f_pos, le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = 0; brelse(bh); + if (ctx->pos < inode->i_size) { + if (!dir_relax(inode)) + return 0; + } } -out: - return ret; + return 0; } static inline int is_32bit_api(void) @@ -490,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file *filp, void *dirent, - filldir_t filldir, struct fname *fname) +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = file_inode(filp); - struct super_block *sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " @@ -507,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent, inode->i_ino, current->comm); return 0; } - curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, + fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; + get_dtype(sb, fname->file_type))) { info->extra_fname = fname; - return error; + return 1; } fname = fname->next; } return 0; } -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = file_inode(filp); + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = ext4_htree_create_dir_info(filp, filp->f_pos); + info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == ext4_get_htree_eof(filp)) + if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp, filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* @@ -555,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp, * chain, return them first. */ if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) + if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; @@ -569,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext4_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = ext4_get_htree_eof(filp); + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -588,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); @@ -599,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = ext4_get_htree_eof(filp); + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -607,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -622,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, - .readdir = ext4_readdir, + .iterate = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3b83cd604796..b577e45425b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -175,58 +177,34 @@ struct ext4_map_blocks { }; /* - * For delayed allocation tracking - */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - -/* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_DIRECT 0x0004 - -struct ext4_io_page { - struct page *p_page; - atomic_t p_count; -}; - -#define MAX_IO_PAGES 128 +#define EXT4_IO_END_DIRECT 0x0002 /* - * For converting uninitialized extents on a work queue. - * - * 'page' is only used from the writepage() path; 'pages' is only used for - * buffered writes; they are used to keep page references until conversion - * takes place. For AIO/DIO, neither field is filled in. + * For converting uninitialized extents on a work queue. 'handle' is used for + * buffered writeback. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ unsigned int flag; /* unwritten or not */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; /* for writepages() */ - struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ + atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { int io_op; struct bio *io_bio; ext4_io_end_t *io_end; - struct ext4_io_page *io_page; sector_t io_next_block; }; @@ -403,7 +381,7 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -557,9 +535,8 @@ enum { #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the - inode allocation semaphore for */ + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized @@ -571,8 +548,9 @@ enum { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ @@ -593,11 +571,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 /* - * Flags used by ext4_discard_partial_page_buffers - */ -#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 - -/* * ioctl commands */ #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS @@ -616,6 +589,7 @@ enum { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -890,6 +864,7 @@ struct ext4_inode_info { rwlock_t i_es_lock; struct list_head i_es_lru; unsigned int i_es_lru_nr; /* protected by i_es_lock */ + unsigned long i_touch_when; /* jiffies of last accessing */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -914,12 +889,22 @@ struct ext4_inode_info { qsize_t i_reserved_quota; #endif - /* completed IOs that might need unwritten extents handling */ - struct list_head i_completed_io_list; + /* Lock protecting lists below */ spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + /* + * Completed IOs that need unwritten extents handling and don't have + * transaction reserved + */ + struct list_head i_unrsv_conversion_list; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ - struct work_struct i_unwritten_work; /* deferred extent conversion */ + struct work_struct i_rsv_conversion_work; + struct work_struct i_unrsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -949,7 +934,7 @@ struct ext4_inode_info { #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* - * Mount flags + * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ @@ -981,8 +966,16 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. + */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1179,6 +1172,7 @@ struct ext4_sb_info { unsigned int s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; @@ -1247,7 +1241,6 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_max_writeback_mb_bump; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; @@ -1283,8 +1276,10 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; + /* workqueue for unreserved extent convertions (dio) */ + struct workqueue_struct *unrsv_conversion_wq; + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; @@ -1309,6 +1304,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; + unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; @@ -1333,6 +1329,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) return ino == EXT4_ROOT_INO || ino == EXT4_USR_QUOTA_INO || ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || ino == EXT4_JOURNAL_INO || ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && @@ -1343,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + /* Writeback has to have coversion transaction reserved */ + WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && + !(io_end->flag & EXT4_IO_END_DIRECT)); io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } @@ -1374,6 +1374,7 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR -75000 -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - /* * Timeout and state flag for lazy initialization inode thread. */ @@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct buffer_head *bh); /* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, unsigned int block_group, @@ -1995,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_unwritten_io(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -2084,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -2092,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags); +extern int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -2107,9 +2114,10 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); -extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern void ext4_ind_truncate(struct inode *inode); -extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2117,6 +2125,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_dirent_csum_verify(struct inode *inode, @@ -2160,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); + extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ - __LINE__, ## message) extern __printf(5, 6) -void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern __printf(5, 6) -void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_abort(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ - __LINE__, ## message) extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ - __LINE__, ## message) extern __printf(3, 4) -void ext4_msg(struct super_block *, const char *, const char *, ...); +void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ - __LINE__, msg) extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, unsigned long, ext4_fsblk_t, const char *, ...); -#define ext4_grp_locked_error(sb, grp, message...) \ - __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -2306,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, { struct ext4_group_info ***grp_info; long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); grp_info = EXT4_SB(sb)->s_group_info; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); @@ -2509,8 +2573,13 @@ extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, + struct dir_context *ctx, int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, @@ -2547,6 +2616,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2569,19 +2656,18 @@ struct ext4_extent; extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); -extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, - int chunk); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -2609,18 +2695,27 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); +void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); -extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_shutdown(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern void ext4_end_io_work(struct work_struct *work); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_end_io_unrsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, @@ -2633,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); extern int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp); -/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +/* + * Note that these flags will never ever appear in a buffer_head's state flag. + * See EXT4_MAP_... to see where this is used. + */ enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ - = BH_JBDPrivateStart, + = BH_JBDPrivateStart, BH_AllocFromCluster, /* allocated blocks were part of already - * allocated cluster. Note that this flag will - * never, ever appear in a buffer_head's state - * flag. See EXT4_MAP_FROM_CLUSTER to see where - * this is used. */ + * allocated cluster. */ }; -BUFFER_FNS(Uninit, uninit) -TAS_BUFFER_FNS(Uninit, uninit) - /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 8643ff5bbeb7..51bc821ade90 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7058975e3a55..72a3600aedbd 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -38,29 +38,43 @@ static void ext4_put_nojournal(handle_t *handle) /* * Wrappers for jbd2_journal_start/end. */ -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int nblocks) +static int ext4_journal_check_start(struct super_block *sb) { journal_t *journal; - trace_ext4_journal_start(sb, nblocks, _RET_IP_); + might_sleep(); if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - + return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; - if (!journal) - return ext4_get_nojournal(); /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ - if (is_journal_aborted(journal)) { + if (journal && is_journal_aborted(journal)) { ext4_abort(sb, "Detected aborted journal"); - return ERR_PTR(-EROFS); + return -EROFS; } - return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); + return 0; +} + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks) +{ + journal_t *journal; + int err; + + trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) + return ERR_PTR(err); + + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, + type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -84,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) return err; } +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type) +{ + struct super_block *sb; + int err; + + if (!ext4_handle_valid(handle)) + return ext4_get_nojournal(); + + sb = handle->h_journal->j_private; + trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, + _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) { + jbd2_journal_free_reserved(handle); + return ERR_PTR(err); + } + + err = jbd2_journal_start_reserved(handle, type, line); + if (err < 0) + return ERR_PTR(err); + return handle; +} + void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) @@ -113,6 +151,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, { int err = 0; + might_sleep(); + if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) @@ -209,6 +249,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, { int err = 0; + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4c216b1bf20c..2877258d9497 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -29,11 +29,13 @@ * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to - * 5 levels of tree + root which are stored in the inode. */ + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) + ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -132,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode) #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 -#define EXT4_HT_MAX 11 +#define EXT4_HT_EXT_CONVERT 11 +#define EXT4_HT_MAX 12 /** * struct ext4_journal_cb_entry - Base structure for callback information. @@ -194,16 +197,20 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister + * Return true if object was sucessfully removed */ -static inline void ext4_journal_callback_del(handle_t *handle, +static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { + bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); + return deleted; } int @@ -259,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int nblocks); + int type, int blocks, int rsv_blocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -294,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) #define ext4_journal_start(inode, type, nblocks) \ - __ext4_journal_start((inode), __LINE__, (type), (nblocks)) + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, - int nblocks) + int blocks, int rsv_blocks) { - return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); + return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + rsv_blocks); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) +#define ext4_journal_start_reserved(handle, type) \ + __ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type); + +static inline void ext4_journal_free_reserved(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_free_reserved(handle); +} + static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9c6d06dcef8b..7097b0f680e6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) { int err; if (path->p_bh) { @@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } depth = ext_depth(inode); ex = path[depth].p_ext; + eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EIO; } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) - && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { /* - * ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_ext_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } } depth = ext_depth(inode); @@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; + if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) + flags = EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); if (err) goto cleanup; @@ -2066,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, next_del = ext4_find_delayed_extent(inode, &es); if (!exists && next_del) { exists = 1; - flags |= FIEMAP_EXTENT_DELALLOC; + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); } up_read(&EXT4_I(inode)->i_data_sem); @@ -2269,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, } /* - * How many index/leaf blocks need to change/allocate to modify nrblocks? + * How many index/leaf blocks need to change/allocate to add @extents extents? * - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split. * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare. */ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; @@ -2290,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) depth = ext_depth(inode); - if (chunk) + if (extents <= 1) index = depth * 2; else index = depth * 3; @@ -2298,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) return index; } +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - ext4_fsblk_t *partial_cluster, + long long *partial_cluster, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = 0; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; - else if (ext4_should_journal_data(inode)) - flags |= EXT4_FREE_BLOCKS_FORGET; + int flags = get_default_free_blocks_flags(inode); /* * For bigalloc file systems, we never free a partial cluster @@ -2329,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if ((*partial_cluster > 0) && + (EXT4_B2C(sbi, pblk) != *partial_cluster)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2355,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; + unsigned int unaligned; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, pblk); + /* + * Usually we want to free partial cluster at the end of the + * extent, except for the situation when the cluster is still + * used by any other extent (partial_cluster is negative). + */ + if (*partial_cluster < 0 && + -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + ext_debug("free last %u blocks starting %llu partial %lld\n", + num, pblk, *partial_cluster); ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* * If the block range to be freed didn't start at the * beginning of a cluster, and we removed the entire - * extent, save the partial cluster here, since we - * might need to delete if we determine that the - * truncate operation has removed all of the blocks in - * the cluster. + * extent and the cluster is not used by any other extent, + * save the partial cluster here, since we might need to + * delete if we determine that the truncate operation has + * removed all of the blocks in the cluster. + * + * On the other hand, if we did not manage to free the whole + * extent, we have to mark the cluster as used (store negative + * cluster number in partial_cluster). */ - if (pblk & (sbi->s_cluster_ratio - 1) && - (ee_len == num)) + unaligned = pblk & (sbi->s_cluster_ratio - 1); + if (unaligned && (ee_len == num) && + (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); - else + else if (unaligned) + *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); + else if (*partial_cluster > 0) *partial_cluster = 0; - } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); - - } else { - printk(KERN_INFO "strange request: removal(2) " - "%u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } + } else + ext4_error(sbi->s_sb, "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } @@ -2402,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * has been released from it. It gets negative in case + * that the cluster is still used. * @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + struct ext4_ext_path *path, + long long *partial_cluster, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2420,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; + ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf to %u\n", start, end); @@ -2431,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, return -EIO; } /* find where to start removing */ - ex = EXT_LAST_EXTENT(eh); + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2458,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so if this extent is not cluster aligned we have + * to mark the current cluster as used to avoid + * accidentally freeing it later on + */ + pblk = ext4_ext_pblock(ex); + if (pblk & (sbi->s_cluster_ratio - 1)) + *partial_cluster = + -((long long)EXT4_B2C(sbi, pblk)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2533,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else + } else if (*partial_cluster > 0) *partial_cluster = 0; err = ext4_ext_dirty(handle, inode, path + depth); @@ -2551,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * If there is still a entry in the leaf node, check to see if - * it references the partial cluster. This is the only place - * where it could; if it doesn't, we can free the cluster. + * Free the partial cluster only if the current extent does not + * reference it. Otherwise we might free used cluster. */ - if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + if (*partial_cluster > 0 && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + int flags = get_default_free_blocks_flags(inode); ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), @@ -2599,13 +2680,13 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - ext4_fsblk_t partial_cluster = 0; + long long partial_cluster = 0; handle_t *handle; int i = 0, err = 0; @@ -2617,7 +2698,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, return PTR_ERR(handle); again: - trace_ext4_ext_remove_space(inode, start, depth); + trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that @@ -2667,12 +2748,14 @@ again: /* * Split the extent in two so that 'end' is the last - * block in the first new extent + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. */ err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); if (err < 0) goto out; @@ -2783,17 +2866,14 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, - path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, + partial_cluster, path->p_hdr->eh_entries); /* If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial * cluster as well. */ - if (partial_cluster && path->p_hdr->eh_entries == 0) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { + int flags = get_default_free_blocks_flags(inode); ext4_free_blocks(handle, inode, NULL, EXT4_C2B(EXT4_SB(sb), partial_cluster), @@ -3147,35 +3227,35 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, + int flags) { struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; - unsigned int ee_len, depth; - int allocated, max_zeroout = 0; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3186,77 +3266,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its left neighbor. This is much cheaper + * uninitialized extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly - * memmove() calls. This is the common case in steady state for - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append - * writes. + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. * * Limitations of the current logic: - * - L1: we only deal with writes at the start of the extent. - * The approach could be extended to writes at the end - * of the extent but this scenario was deemed less common. - * - L2: we do not deal with writes covering the whole extent. + * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. - * - L3: we only attempt to merge with an extent stored in the + * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ - if ((map->m_lblk == ee_block) && /*L1*/ - (map->m_len < ee_len) && /*L2*/ - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ - struct ext4_extent *prev_ex; + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; - unsigned int prev_len, write_len; + unsigned int prev_len; - prev_ex = ex - 1; - prev_lblk = le32_to_cpu(prev_ex->ee_block); - prev_len = ext4_ext_get_actual_len(prev_ex); - prev_pblk = ext4_ext_pblock(prev_ex); + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); - write_len = map->m_len; /* - * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: - * - C1: prev_ex is initialized, - * - C2: prev_ex is logically abutting ex, - * - C3: prev_ex is physically abutting ex, - * - C4: prev_ex can receive the additional blocks without + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, - map, ex, prev_ex); + map, ex, abut_ex); - /* Shift the start of ex by 'write_len' blocks */ - ex->ee_block = cpu_to_le32(ee_block + write_len); - ext4_ext_store_pblock(ex, ee_pblk + write_len); - ex->ee_len = cpu_to_le16(ee_len - write_len); + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - /* Extend prev_ex by 'write_len' blocks */ - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); - /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - /* Update path to point to the right extent */ - path[depth].p_ext = prev_ex; + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = write_len; - goto out; + allocated = map_len; } } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* @@ -3330,7 +3454,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, flags); if (allocated < 0) err = allocated; @@ -3537,7 +3661,7 @@ int ext4_find_delalloc_range(struct inode *inode, { struct extent_status es; - ext4_es_find_delayed_extent(inode, lblk_start, &es); + ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ else if (es.es_lblk <= lblk_start && @@ -3650,6 +3774,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + /* + * When writing into uninitialized space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, allocated, newblock); @@ -3713,7 +3843,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -4252,53 +4382,18 @@ out2: } out3: - trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); + trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(handle_t *handle, struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; - handle_t *handle; - loff_t page_len; int err = 0; /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_unwritten_io(inode); - - /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); - if (IS_ERR(handle)) - return; - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - down_write(&EXT4_I(inode)->i_data_sem); - - ext4_discard_preallocations(inode); - - /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. @@ -4313,29 +4408,6 @@ void ext4_ext_truncate(struct inode *inode) err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out_stop: - /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); } static void ext4_falloc_update_inode(struct inode *inode, @@ -4393,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); + return ext4_punch_hole(inode, offset, len); ret = ext4_convert_inline_data(inode); if (ret) @@ -4495,10 +4567,9 @@ retry: * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len) +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) { - handle_t *handle; unsigned int max_blocks; int ret = 0; int ret2 = 0; @@ -4513,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - map.m_lblk); /* - * credits to insert 1 extent into extent tree + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + credits = 0; + } else { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); @@ -4533,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret <= 0 || ret2 ) + if (credits) + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2) break; } + if (!credits) + ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } @@ -4555,9 +4645,10 @@ static int ext4_find_delayed_extent(struct inode *inode, struct extent_status es; ext4_lblk_t block, next_del; - ext4_es_find_delayed_extent(inode, newes->es_lblk, &es); - if (newes->es_pblk == 0) { + ext4_es_find_delayed_extent_range(inode, newes->es_lblk, + newes->es_lblk + newes->es_len - 1, &es); + /* * No extent in extent-tree contains block @newes->es_pblk, * then the block may stay in 1)a hole or 2)delayed-extent. @@ -4577,7 +4668,7 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent(inode, block, &es); + ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else @@ -4605,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode, error = ext4_get_inode_loc(inode, &iloc); if (error) return error; - physical = iloc.bh->b_blocknr << blockbits; + physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; @@ -4613,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode, flags |= FIEMAP_EXTENT_DATA_INLINE; brelse(iloc.bh); } else { /* external block */ - physical = EXT4_I(inode)->i_file_acl << blockbits; + physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; } @@ -4623,187 +4714,6 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int credits, err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - err = ext4_flush_unwritten_io(inode); - if (err) - goto out_dio; - inode_dio_wait(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_dio; - } - - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zeroed. - */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (err) - goto out; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); -out_dio: - ext4_inode_resume_unlocked_dio(inode); -out_mutex: - mutex_unlock(&inode->i_mutex); - return err; -} - int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index fe3337a85ede..ee018d5f397e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -10,6 +10,7 @@ * Ext4 extents status tree core functions. */ #include <linux/rbtree.h> +#include <linux/list_sort.h> #include "ext4.h" #include "extents_status.h" #include "ext4_extents.h" @@ -232,14 +233,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk - * if it exists, otherwise, the next extent after @es->lblk. + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents * @lblk: the offset where we start to search + * @end: the offset where we stop to search * @es: delayed extent that we found */ -void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { struct ext4_es_tree *tree = NULL; @@ -247,7 +250,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct rb_node *node; BUG_ON(es == NULL); - trace_ext4_es_find_delayed_extent_enter(inode, lblk); + BUG_ON(end < lblk); + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -270,6 +274,10 @@ out: if (es1 && !ext4_es_is_delayed(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } if (ext4_es_is_delayed(es1)) break; } @@ -284,8 +292,7 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); - trace_ext4_es_find_delayed_extent_exit(inode, es); + trace_ext4_es_find_delayed_extent_range_exit(inode, es); } static struct extent_status * @@ -665,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, error: write_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); ext4_es_print_tree(inode); return err; @@ -727,7 +733,6 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); trace_ext4_es_lookup_extent_exit(inode, es, found); return found; } @@ -871,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) EXTENT_STATUS_WRITTEN); } +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct ext4_inode_info *eia, *eib; + eia = list_entry(a, struct ext4_inode_info, i_es_lru); + eib = list_entry(b, struct ext4_inode_info, i_es_lru); + + if (eia->i_touch_when == eib->i_touch_when) + return 0; + if (time_after(eia->i_touch_when, eib->i_touch_when)) + return 1; + else + return -1; +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); struct ext4_inode_info *ei; - struct list_head *cur, *tmp, scanned; + struct list_head *cur, *tmp; + LIST_HEAD(skiped); int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk = 0; @@ -886,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) if (!nr_to_scan) return ret; - INIT_LIST_HEAD(&scanned); - spin_lock(&sbi->s_es_lru_lock); + + /* + * If the inode that is at the head of LRU list is newer than + * last_sorted time, that means that we need to sort this list. + */ + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); + if (sbi->s_es_last_sorted < ei->i_touch_when) { + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + } + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - list_move_tail(cur, &scanned); + /* + * If we have already reclaimed all extents from extent + * status tree, just stop the loop immediately. + */ + if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - read_lock(&ei->i_es_lock); - if (ei->i_es_lru_nr == 0) { - read_unlock(&ei->i_es_lock); + /* Skip the inode that is newer than the last_sorted time */ + if (sbi->s_es_last_sorted < ei->i_touch_when) { + list_move_tail(cur, &skiped); continue; } - read_unlock(&ei->i_es_lock); + + if (ei->i_es_lru_nr == 0) + continue; write_lock(&ei->i_es_lock); ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + if (ei->i_es_lru_nr == 0) + list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); nr_shrunk += ret; @@ -910,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) if (nr_to_scan == 0) break; } - list_splice_tail(&scanned, &sbi->s_es_lru); + + /* Move the newer inodes into the tail of the LRU list. */ + list_splice_tail(&skiped, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); @@ -918,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) return ret; } -void ext4_es_register_shrinker(struct super_block *sb) +void ext4_es_register_shrinker(struct ext4_sb_info *sbi) { - struct ext4_sb_info *sbi; - - sbi = EXT4_SB(sb); INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_last_sorted = 0; sbi->s_es_shrinker.shrink = ext4_es_shrink; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sbi->s_es_shrinker); } -void ext4_es_unregister_shrinker(struct super_block *sb) +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { - unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); + unregister_shrinker(&sbi->s_es_shrinker); } void ext4_es_lru_add(struct inode *inode) @@ -940,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ei->i_touch_when = jiffies; + + if (!list_empty(&ei->i_es_lru)) + return; + spin_lock(&sbi->s_es_lru_lock); if (list_empty(&ei->i_es_lru)) list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - else - list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index d8e2d4dc311e..e936730cc5b0 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,7 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_sb_info; struct ext4_extent; struct extent_status { @@ -62,7 +63,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, unsigned long long status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); @@ -118,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es, es->es_pblk = block; } -extern void ext4_es_register_shrinker(struct super_block *sb); -extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..6f4cc567c382 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include "ext4.h" @@ -311,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -456,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { if (last != start) - dataoff = last << blkbits; + dataoff = (loff_t)last << blkbits; break; } @@ -464,10 +465,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - ext4_es_find_delayed_extent(inode, last, &es); + ext4_es_find_delayed_extent_range(inode, last, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) - dataoff = last << blkbits; + dataoff = (loff_t)last << blkbits; break; } @@ -485,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) } last++; - dataoff = last << blkbits; + dataoff = (loff_t)last << blkbits; } while (last <= end); mutex_unlock(&inode->i_mutex); @@ -493,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) if (dataoff > isize) return -ENXIO; - if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (dataoff > maxsize) - return -EINVAL; - - if (dataoff != file->f_pos) { - file->f_pos = dataoff; - file->f_version = 0; - } - - return dataoff; + return vfs_setpos(file, dataoff, maxsize); } /* @@ -539,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { last += ret; - holeoff = last << blkbits; + holeoff = (loff_t)last << blkbits; continue; } @@ -547,10 +538,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - ext4_es_find_delayed_extent(inode, last, &es); + ext4_es_find_delayed_extent_range(inode, last, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { last = es.es_lblk + es.es_len; - holeoff = last << blkbits; + holeoff = (loff_t)last << blkbits; continue; } @@ -565,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) &map, &holeoff); if (!unwritten) { last += ret; - holeoff = last << blkbits; + holeoff = (loff_t)last << blkbits; continue; } } @@ -579,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) if (holeoff > isize) holeoff = isize; - if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (holeoff > maxsize) - return -EINVAL; - - if (holeoff != file->f_pos) { - file->f_pos = holeoff; - file->f_version = 0; - } - - return holeoff; + return vfs_setpos(file, holeoff, maxsize); } /* diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..a8bc47f75fa0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode) return ret; } -/** - * __sync_file - generic_file_fsync without the locking and filemap_write - * @inode: inode to sync - * @datasync: only sync essential metadata if true - * - * This is just generic_file_fsync without the locking. This is needed for - * nojournal mode to make sure this inodes data/metadata makes it to disk - * properly. The i_mutex should be held already. - */ -static int __sync_inode(struct inode *inode, int datasync) -{ - int err; - int ret; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = sync_inode_metadata(inode, 1); - if (ret == 0) - ret = err; - return ret; -} - /* * akpm: A new design for ext4_sync_file(). * @@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret, err; + int ret = 0, err; tid_t commit_tid; bool needs_barrier = false; @@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - if (inode->i_sb->s_flags & MS_RDONLY) - goto out; - - ret = ext4_flush_unwritten_io(inode); - if (ret < 0) + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ret = -EROFS; goto out; + } if (!journal) { - ret = __sync_inode(inode, datasync); + ret = generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; } + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -166,15 +139,13 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; } - out: - mutex_unlock(&inode->i_mutex); +out: trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6c5bb8d993fe..f03598c6ffd3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -697,7 +714,7 @@ got_group: gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; /* * Check free inodes count before loading bitmap. @@ -711,7 +728,7 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!inode_bitmap_bh) - goto fail; + goto out; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) @@ -730,16 +747,20 @@ repeat_in_this_group: if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks); + handle_type, nblocks, + 0); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto fail; + ext4_std_error(sb, err); + goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); @@ -755,8 +776,10 @@ repeat_in_this_group: got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && @@ -768,7 +791,8 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); @@ -787,14 +811,18 @@ got: ext4_unlock_group(sb, group); brelse(block_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { @@ -840,8 +868,10 @@ got: BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) @@ -851,16 +881,6 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (owner) { - inode->i_mode = mode; - i_uid_write(inode, owner[0]); - i_gid_write(inode, owner[1]); - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -889,7 +909,9 @@ got: * twice. */ err = -EIO; - goto fail; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -899,7 +921,6 @@ got: if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { __u32 csum; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, @@ -918,7 +939,6 @@ got: ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -952,24 +972,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; clear_nlink(inode); unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a04183127ef0..87b30cd357e7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ @@ -292,131 +293,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, } /** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - WARN(1, KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** * ext4_alloc_branch - allocate and set up a chain of blocks. * @handle: handle for this transaction * @inode: owner @@ -448,60 +324,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, int *blks, ext4_fsblk_t goal, ext4_lblk_t *offsets, Indirect *branch) { - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; + struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); /* - * metadata blocks and data blocks are allocated. + * Set up for the direct block allocation */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = *blks; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + ar.goal = goal; + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + } else + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, + goal, 0, NULL, &err); + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } - - branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ unlock_buffer(bh); goto failed; } - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar.len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -511,25 +386,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (err) goto failed; } - *blks = num; - return err; + *blks = ar.len; + return 0; failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); + for (; i >= 0; i--) { + if (i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar.len : 1, 0); } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - return err; } @@ -758,7 +624,7 @@ cleanup: partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map, err); + trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } @@ -809,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { - mutex_lock(&inode->i_mutex); - ext4_flush_unwritten_io(inode); - mutex_unlock(&inode->i_mutex); - } /* * Nolock dioread optimization may be dynamically disabled * via ext4_inode_block_unlocked_dio(). Check inode's state @@ -913,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } -int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block */ - indirects = nrblocks * 2 + 1; - return indirects; + return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } /* @@ -941,26 +793,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If + * Try to extend this transaction for the purposes of truncation. If * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. * * Returns 0 if we managed to create more room. If we can't create more * room, and the transaction must be restarted we return 1. @@ -1091,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { @@ -1353,68 +1190,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } -void ext4_ind_truncate(struct inode *inode) +void ext4_ind_truncate(handle_t *handle, struct inode *inode) { - handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; - loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; - int err; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) - goto out_stop; /* error */ + return; } - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* @@ -1431,7 +1230,7 @@ void ext4_ind_truncate(struct inode *inode) * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ - goto out_unlock; + return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -1491,31 +1290,6 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); } static int free_hole_blocks(handle_t *handle, struct inode *inode, @@ -1569,8 +1343,8 @@ err: return ret; } -static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) +int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) { int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); int level, ret = 0; @@ -1604,157 +1378,3 @@ err: return ret; } -int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle = NULL; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extents beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio works, newcomers will block on i_mutex */ - inode_dio_wait(inode); - - handle = start_transaction(inode); - if (IS_ERR(handle)) - goto out_mutex; - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zerod. - */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - if (err) - goto out; - } else { - /* - * Zero out and unmap the paritial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * Zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size contained in the last page, we need to - * unmap and zero the paritial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - -out_mutex: - mutex_unlock(&inode->i_mutex); - - return err; -} diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0fd1a123f7d..d9ecbf1113a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -19,7 +19,8 @@ #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) -#define EXT4_INLINE_DOTDOT_SIZE 4 +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 int ext4_get_inline_size(struct inode *inode) { @@ -71,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); - free += le32_to_cpu(entry->e_value_size); + free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } @@ -1289,19 +1290,133 @@ out: return ret; } -int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, +/* + * This function fills a red-black tree with information from an + * inlined dir. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data) +{ + int err = 0, count = 0; + unsigned int parent_ino; + int pos; + struct ext4_dir_entry_2 *de; + struct inode *inode = file_inode(dir_file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + struct ext4_dir_entry_2 fake; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + pos = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + while (pos < inline_size) { + /* + * As inlined dir doesn't store any information about '.' and + * only the inode number of '..' is stored, we have to handle + * them differently. + */ + if (pos == 0) { + fake.inode = cpu_to_le32(inode->i_ino); + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_OFFSET; + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { + fake.inode = cpu_to_le32(parent_ino); + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_SIZE; + } else { + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); + if (ext4_check_dir_entry(inode, dir_file, de, + iloc.bh, dir_buf, + inline_size, pos)) { + ret = count; + goto out; + } + } + + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de); + if (err) { + count = err; + goto out; + } + count++; + } + ret = count; +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +/* + * So this function is called when the volume is mkfsed with + * dir_index disabled. In order to keep f_pos persistent + * after we convert from an inlined dir to a blocked based, + * we just pretend that we are a normal dir and return the + * offset as if '.' and '..' really take place. + * + */ +int ext4_read_inline_dir(struct file *file, + struct dir_context *ctx, int *has_inline_data) { - int error = 0; unsigned int offset, parent_ino; - int i, stored; + int i; struct ext4_dir_entry_2 *de; struct super_block *sb; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; + int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1328,96 +1443,89 @@ int ext4_read_inline_dir(struct file *filp, goto out; sb = inode->i_sb; - stored = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + offset = ctx->pos; - while (!error && !stored && filp->f_pos < inode->i_size) { -revalidate: - /* - * If the version has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the inline - * dir to make sure. - */ - if (filp->f_version != inode->i_version) { - for (i = 0; - i < inode->i_size && i < offset;) { - if (!i) { - /* skip "." and ".." if needed. */ - i += EXT4_INLINE_DOTDOT_SIZE; - continue; - } - de = (struct ext4_dir_entry_2 *) - (dir_buf + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - inline_size) < EXT4_DIR_REC_LEN(1)) - break; - i += ext4_rec_len_from_disk(de->rec_len, - inline_size); - } - offset = i; - filp->f_pos = offset; - filp->f_version = inode->i_version; - } + /* + * dotdot_offset and dotdot_size is the real offset and + * size for ".." and "." if the dir is block based while + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. + * So we will use extra_offset and extra_size to indicate them + * during the inline dir iteration. + */ + dotdot_offset = EXT4_DIR_REC_LEN(1); + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; - while (!error && filp->f_pos < inode->i_size) { - if (filp->f_pos == 0) { - error = filldir(dirent, ".", 1, 0, inode->i_ino, - DT_DIR); - if (error) - break; - stored++; - - error = filldir(dirent, "..", 2, 0, parent_ino, - DT_DIR); - if (error) - break; - stored++; - - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (file->f_version != inode->i_version) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ + if (!i) { + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; continue; } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ + de = (struct ext4_dir_entry_2 *) + (dir_buf + i - extra_offset); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, extra_size) + < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + extra_size); + } + offset = i; + ctx->pos = offset; + file->f_version = inode->i_version; + } - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); - if (ext4_check_dir_entry(inode, filp, de, - iloc.bh, dir_buf, - inline_size, offset)) { - ret = stored; + while (ctx->pos < extra_size) { + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_offset; + continue; + } + + if (ctx->pos == dotdot_offset) { + if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_size; + continue; + } + + de = (struct ext4_dir_entry_2 *) + (dir_buf + ctx->pos - extra_offset); + if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, + extra_size, ctx->pos)) + goto out; + if (le32_to_cpu(de->inode)) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) goto out; - } - offset += ext4_rec_len_from_disk(de->rec_len, - inline_size); - if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; - } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - inline_size); } - offset = 0; + ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); @@ -1702,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode, if (error) goto out; - physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); length = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3a5213bc73e..0188e65e1f58 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/aio.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -55,21 +56,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, __u16 csum_hi = 0; __u32 csum; - csum_lo = raw->i_checksum_lo; + csum_lo = le16_to_cpu(raw->i_checksum_lo); raw->i_checksum_lo = 0; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = raw->i_checksum_hi; + csum_hi = le16_to_cpu(raw->i_checksum_hi); raw->i_checksum_hi = 0; } csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, EXT4_INODE_SIZE(inode->i_sb)); - raw->i_checksum_lo = csum_lo; + raw->i_checksum_lo = cpu_to_le16(csum_lo); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = csum_hi; + raw->i_checksum_hi = cpu_to_le16(csum_hi); return csum; } @@ -131,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static void ext4_invalidatepage(struct page *page, unsigned long offset); +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); /* * Test whether an inode is a fast symlink. @@ -210,12 +211,12 @@ void ext4_evict_inode(struct inode *inode) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); + jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); - ext4_ioend_shutdown(inode); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); - ext4_ioend_shutdown(inode); + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) goto no_delete; @@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func, #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) -/* - * Return the number of contiguous dirty pages in a given inode - * starting at page frame idx. - */ -static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, - unsigned int max_pages) -{ - struct address_space *mapping = inode->i_mapping; - pgoff_t index; - struct pagevec pvec; - pgoff_t num = 0; - int i, nr_pages, done = 0; - - if (max_pages == 0) - return 0; - pagevec_init(&pvec, 0); - while (!done) { - index = idx; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - lock_page(page); - if (unlikely(page->mapping != mapping) || - !PageDirty(page) || - PageWriteback(page) || - page->index != idx) { - done = 1; - unlock_page(page); - break; - } - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) - done = 1; - bh = bh->b_this_page; - } while (!done && (bh != head)); - } - unlock_page(page); - if (done) - break; - idx++; - num++; - if (num >= max_pages) { - done = 1; - break; - } - } - pagevec_release(&pvec); - } - return num; -} - #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, @@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + ext4_es_lru_add(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { @@ -1081,31 +1024,56 @@ retry_journal: /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { + int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; } -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int i_size_changed = 0; - struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int i_size_changed = 0; - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } + + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) + goto errout; + copied = ret; + } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. + * cannot change under us because we hole i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1115,10 +1083,10 @@ static int ext4_generic_write_end(struct file *file, i_size_changed = 1; } - if (pos + copied > EXT4_I(inode)->i_disksize) { + if (pos + copied > EXT4_I(inode)->i_disksize) { /* We need to mark inode dirty even if * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) + * but greater than i_disksize. (hint delalloc) */ ext4_update_i_disksize(inode, (pos + copied)); i_size_changed = 1; @@ -1135,87 +1103,13 @@ static int ext4_generic_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - return copied; -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext4 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. - */ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_ordered_write_end(inode, pos, len, copied); - ret = ext4_jbd2_file_inode(handle, inode); - - if (ret == 0) { - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - if (ret2 < 0) - ret = ret2; - } else { - unlock_page(page); - page_cache_release(page); - } - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_writeback_write_end(inode, pos, len, copied); - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); - - if (ret2 < 0) - ret = ret2; - +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1465,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int stop = offset + length; int num_clusters; ext4_fsblk_t lblk; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; + if (next_off > stop) + break; + if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); @@ -1510,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with writepage() call back - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. - * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct pagevec pvec; - unsigned long index, end; - int ret = 0, err, nr_pages, i; - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - loff_t size = i_size_read(inode); - unsigned int len, block_start; - struct buffer_head *bh, *page_bufs = NULL; - sector_t pblock = 0, cur_logical = 0; - struct ext4_io_submit io_submit; +struct mpage_da_data { + struct inode *inode; + struct writeback_control *wbc; - BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); + pgoff_t first_page; /* The first page to write */ + pgoff_t next_page; /* Current page to examine */ + pgoff_t last_page; /* Last page to examine */ /* - * We need to start from the first_page to the next_page - 1 - * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking - * at the currently mapped buffer_heads. + * Extent to map - this can be after first_page because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. */ - index = mpd->first_page; - end = mpd->next_page - 1; - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - int skip_page = 0; - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - - if (index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - if (map) { - cur_logical = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - pblock = map->m_pblk + (cur_logical - - map->m_lblk); - } - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - bh = page_bufs = page_buffers(page); - block_start = 0; - do { - if (map && (cur_logical >= map->m_lblk) && - (cur_logical <= (map->m_lblk + - (map->m_len - 1)))) { - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock; - } - if (buffer_unwritten(bh) || - buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); - clear_buffer_unwritten(bh); - } - - /* - * skip page if block allocation undone and - * block is dirty - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) - skip_page = 1; - bh = bh->b_this_page; - block_start += bh->b_size; - cur_logical++; - pblock++; - } while (bh != page_bufs); - - if (skip_page) { - unlock_page(page); - continue; - } - - clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&io_submit, page, len, - mpd->wbc); - if (!err) - mpd->pages_written++; - /* - * In error case, we have to continue because - * remaining pages are still locked - */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - ext4_io_submit(&io_submit); - return ret; -} + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ +}; -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) { int nr_pages, i; pgoff_t index, end; struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t start, last; + + /* This is necessary when next_page == 0. */ + if (mpd->first_page >= mpd->next_page) + return; index = mpd->first_page; end = mpd->next_page - 1; - - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, start, last - start + 1); + if (invalidate) { + ext4_lblk_t start, last; + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + } pagevec_init(&pvec, 0); while (index <= end) { @@ -1656,236 +1460,40 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) break; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - block_invalidatepage(page, 0); - ClearPageUptodate(page); + if (invalidate) { + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + ClearPageUptodate(page); + } unlock_page(page); } index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } - return; } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), - ext4_count_free_clusters(inode->i_sb))); + ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", - EXT4_I(inode)->i_reserved_data_blocks); + ei->i_reserved_data_blocks); ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - EXT4_I(inode)->i_reserved_meta_blocks); - return; -} - -/* - * mpage_da_map_and_submit - go through given space, map them - * if necessary, and then submit them for I/O - * - * @mpd - bh describing space - * - * The function skips space we know is already mapped to disk blocks. - * - */ -static void mpage_da_map_and_submit(struct mpage_da_data *mpd) -{ - int err, blks, get_blocks_flags; - struct ext4_map_blocks map, *mapp = NULL; - sector_t next = mpd->b_blocknr; - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; - loff_t disksize = EXT4_I(mpd->inode)->i_disksize; - handle_t *handle = NULL; - - /* - * If the blocks are mapped already, or we couldn't accumulate - * any blocks, then proceed immediately to the submission stage. - */ - if ((mpd->b_size == 0) || - ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten)))) - goto submit_io; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - - /* - * Call ext4_map_blocks() to allocate any delayed allocation - * blocks, or to convert an uninitialized extent to be - * initialized (in the case where we have written into - * one or more preallocated blocks). - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to - * indicate that we are on the delayed allocation path. This - * affects functions in many different parts of the allocation - * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_map_blocks() - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the - * inode's allocation semaphore is taken. - * - * If the blocks in questions were delalloc blocks, set - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting - * variables are updated after the blocks have been allocated. - */ - map.m_lblk = next; - map.m_len = max_blocks; - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; - if (ext4_should_dioread_nolock(mpd->inode)) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); - if (blks < 0) { - struct super_block *sb = mpd->inode->i_sb; - - err = blks; - /* - * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will just let - * mpage_da_submit_io() unlock all of the pages. - */ - if (err == -EAGAIN) - goto submit_io; - - if (err == -ENOSPC && ext4_count_free_clusters(sb)) { - mpd->retval = err; - goto submit_io; - } - - /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. - */ - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { - ext4_msg(sb, KERN_CRIT, - "delayed block allocation failed for inode %lu " - "at logical offset %llu with max blocks %zd " - "with error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost"); - if (err == -ENOSPC) - ext4_print_free_blocks(mpd->inode); - } - /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd); - - /* Mark this page range as having been completed */ - mpd->io_done = 1; - return; - } - BUG_ON(blks == 0); - - mapp = ↦ - if (map.m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = mpd->inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map.m_len; i++) - unmap_underlying_metadata(bdev, map.m_pblk + i); - } - - /* - * Update on-disk size along with block allocation. - */ - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; - if (disksize > i_size_read(mpd->inode)) - disksize = i_size_read(mpd->inode); - if (disksize > EXT4_I(mpd->inode)->i_disksize) { - ext4_update_i_disksize(mpd->inode, disksize); - err = ext4_mark_inode_dirty(handle, mpd->inode); - if (err) - ext4_error(mpd->inode->i_sb, - "Failed to mark inode %lu dirty", - mpd->inode->i_ino); - } - -submit_io: - mpage_da_submit_io(mpd, mapp); - mpd->io_done = 1; -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ - (1 << BH_Delay) | (1 << BH_Unwritten)) - -/* - * mpage_add_bh_to_extent - try to add one more block to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @b_state - b_state of the buffer head added - * - * the function is used to collect contig. blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, - unsigned long b_state) -{ - sector_t next; - int blkbits = mpd->inode->i_blkbits; - int nrblocks = mpd->b_size >> blkbits; - - /* - * XXX Don't go larger than mballoc is willing to allocate - * This is a stopgap solution. We eventually need to fold - * mpage_da_submit_io() into this function and then call - * ext4_map_blocks() multiple times in a loop - */ - if (nrblocks >= (8*1024*1024 >> blkbits)) - goto flush_it; - - /* check if the reserved journal credits might overflow */ - if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { - if (nrblocks >= EXT4_MAX_TRANS_DATA) { - /* - * With non-extent format we are limited by the journal - * credit available. Total credit needed to insert - * nrblocks contiguous blocks is dependent on the - * nrblocks. So limit nrblocks. - */ - goto flush_it; - } - } - /* - * First block in the extent - */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = 1 << blkbits; - mpd->b_state = b_state & BH_FLAGS; - return; - } - - next = mpd->b_blocknr + nrblocks; - /* - * Can we merge the block to our big extent? - */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += 1 << blkbits; - return; - } - -flush_it: - /* - * We couldn't merge the block to our extent, so we - * need to flush current extent and start new one - */ - mpage_da_map_and_submit(mpd); + ei->i_reserved_meta_blocks); + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", + ei->i_allocated_meta_blocks); return; } @@ -1921,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); + ext4_es_lru_add(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { @@ -2194,7 +1804,7 @@ out: * lock so we have to do some magic. * * This function can get called via... - * - ext4_da_writepages after taking page lock (have journal handle) + * - ext4_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via the kswapd/direct reclaim (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) @@ -2272,76 +1882,405 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return -ENOMEM; + } ret = ext4_bio_write_page(&io_submit, page, len, wbc); ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) + /* - * This is called via ext4_da_writepages() to - * calculate the total number of credits to reserve to fit - * a single extent allocation into a single transaction, - * ext4_da_writpeages() will loop calling this before - * the block allocation. + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks upto full group size. */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 -static int ext4_da_writepages_trans_blocks(struct inode *inode) +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @b_state - b_state of the buffer head added + * + * the function is used to collect contig. blocks in same state + */ +static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + unsigned long b_state) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return 0; + + /* First block in the extent? */ + if (map->m_len == 0) { + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = b_state & BH_FLAGS; + return 1; + } + + /* Can we merge the block to our big extent? */ + if (lblk == map->m_lblk + map->m_len && + (b_state & BH_FLAGS) == map->m_flags) { + map->m_len++; + return 1; + } + return 0; +} + +static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) +{ + struct inode *inode = mpd->inode; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + + do { + BUG_ON(buffer_locked(bh)); + + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh)) || + lblk >= blocks) { + /* Found extent to map? */ + if (mpd->map.m_len) + return false; + if (lblk >= blocks) + return true; + continue; + } + if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) + return false; + } while (lblk++, (bh = bh->b_this_page) != head); + return true; +} + +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + * submit fully mapped pages for IO + * + * @mpd - description of extent to map, on return next extent to map + * + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to uninitialized extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. + */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) +{ + struct pagevec pvec; + int nr_pages, i; + struct inode *inode = mpd->inode; + struct buffer_head *head, *bh; + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + pgoff_t start, end; + ext4_lblk_t lblk; + sector_t pblock; + int err; + + start = mpd->map.m_lblk >> bpp_bits; + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + lblk = start << bpp_bits; + pblock = mpd->map.m_pblk; + + pagevec_init(&pvec, 0); + while (start <= end) { + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + /* Upto 'end' pages must be contiguous */ + BUG_ON(page->index != start); + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + add_page_bufs_to_extent(mpd, head, bh, + lblk); + pagevec_release(&pvec); + return 0; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (++lblk < blocks && + (bh = bh->b_this_page) != head); + + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + /* Page fully mapped - let IO run! */ + err = mpage_submit_page(mpd, page); + if (err < 0) { + pagevec_release(&pvec); + return err; + } + start++; + } + pagevec_release(&pvec); + } + /* Extent fully mapped and matches with page boundary. We are done. */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { - int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int get_blocks_flags; + int err; + trace_ext4_da_write_pages_extent(inode, map); /* - * With non-extent format the journal credit needed to - * insert nrblocks contiguous block is dependent on - * number of contiguous block. So we will limit - * number of contiguous block to a sane value + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or + * to convert an uninitialized extent to be initialized (in the case + * where we have written into one or more preallocated blocks). It is + * possible that we're going to need more metadata blocks than + * previously reserved. However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks + * in question are delalloc blocks. This affects functions in many + * different parts of the allocation call path. This flag exists + * primarily because we don't want to change *many* call functions, so + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag + * once the inode's allocation semaphore is taken. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - (max_blocks > EXT4_MAX_TRANS_DATA)) - max_blocks = EXT4_MAX_TRANS_DATA; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; + if (ext4_should_dioread_nolock(inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (map->m_flags & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (map->m_flags & EXT4_MAP_UNINIT) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + } - return ext4_chunk_trans_blocks(inode, max_blocks); + BUG_ON(map->m_len == 0); + if (map->m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map->m_len; i++) + unmap_underlying_metadata(bdev, map->m_pblk + i); + } + return 0; } /* - * write_cache_pages_da - walk the list of dirty pages of the given - * address space and accumulate pages that need writing, and call - * mpage_da_map_and_submit to map a single contiguous memory region - * and then write them. + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. */ -static int write_cache_pages_da(handle_t *handle, - struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd, - pgoff_t *done_index) +static int mpage_map_and_submit_extent(handle_t *handle, + struct mpage_da_data *mpd, + bool *give_up_on_write) { - struct buffer_head *bh, *head; - struct inode *inode = mapping->host; - struct pagevec pvec; - unsigned int nr_pages; - sector_t logical; - pgoff_t index, end; - long nr_to_write = wbc->nr_to_write; - int i, tag, ret = 0; - - memset(mpd, 0, sizeof(struct mpage_da_data)); - mpd->wbc = wbc; - mpd->inode = inode; - pagevec_init(&pvec, 0); - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int err; + loff_t disksize; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + mpd->io_submit.io_end->offset = + ((loff_t)map->m_lblk) << inode->i_blkbits; + while (map->m_len) { + err = mpage_map_one_extent(handle, mpd); + if (err < 0) { + struct super_block *sb = inode->i_sb; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + goto invalidate_dirty_pages; + /* + * Let the uper layers retry transient errors. + * In the case of ENOSPC, if ext4_count_free_blocks() + * is non-zero, a commit should free up blocks. + */ + if ((err == -ENOMEM) || + (err == -ENOSPC && ext4_count_free_clusters(sb))) + return err; + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with" + " max blocks %u with error %d", + inode->i_ino, + (unsigned long long)map->m_lblk, + (unsigned)map->m_len, -err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(inode); + invalidate_dirty_pages: + *give_up_on_write = true; + return err; + } + /* + * Update buffer state, submit mapped pages, and get us new + * extent to map + */ + err = mpage_map_and_submit_buffers(mpd); + if (err < 0) + return err; + } + + /* Update on-disk size after IO is submitted */ + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + int err2; + + ext4_update_i_disksize(inode, disksize); + err2 = ext4_mark_inode_dirty(handle, inode); + if (err2) + ext4_error(inode->i_sb, + "Failed to mark inode %lu dirty", + inode->i_ino); + if (!err) + err = err2; + } + return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. + */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + + return ext4_meta_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * and underlying extent to map + * + * @mpd - where to look for pages + * + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). + * + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. + */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec pvec; + unsigned int nr_pages; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + int tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - *done_index = index; + pagevec_init(&pvec, 0); + mpd->map.m_len = 0; + mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) - return 0; + goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2356,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle, if (page->index > end) goto out; - *done_index = page->index + 1; - - /* - * If we can't merge this page, and we have - * accumulated an contiguous region, write it - */ - if ((mpd->next_page != page->index) && - (mpd->next_page != mpd->first_page)) { - mpage_da_map_and_submit(mpd); - goto ret_extent_tail; - } + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; lock_page(page); - /* - * If the page is no longer dirty, or its - * mapping no longer corresponds to inode we - * are writing (which means it has been - * truncated or invalidated), or the page is - * already under writeback and we are not - * doing a data integrity writeback, skip the page + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page */ if (!PageDirty(page) || (PageWriteback(page) && - (wbc->sync_mode == WB_SYNC_NONE)) || + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(page->mapping != mapping)) { unlock_page(page); continue; @@ -2389,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - /* - * If we have inline data and arrive here, it means that - * we will soon create the block for the 1st page, so - * we'd better clear the inline data here. - */ - if (ext4_has_inline_data(inode)) { - BUG_ON(ext4_test_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA)); - ext4_destroy_inline_data(handle, inode); - } - - if (mpd->next_page != page->index) + if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_CACHE_SHIFT - blkbits); head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate unmapped blocks - * in the same page. Otherwise we won't make - * progress with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && - buffer_mapped(bh)) { - /* - * mapped dirty buffer. We need to - * update the b_state because we look - * at b_state in mpage_da_map_blocks. - * We don't update b_size because if we - * find an unmapped buffer_head later - * we need to use the b_state flag of - * that buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = - bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ + if (!add_page_bufs_to_extent(mpd, head, head, lblk)) + goto out; + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, page); + if (err < 0) goto out; } + + /* + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->next_page - mpd->first_page >= + mpd->wbc->nr_to_write) + goto out; } pagevec_release(&pvec); cond_resched(); } return 0; -ret_extent_tail: - ret = MPAGE_DA_EXTENT_TAIL; out: pagevec_release(&pvec); - cond_resched(); - return ret; + return err; } +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = ext4_writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} -static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - pgoff_t index; + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; int range_whole = 0; + int cycled = 1; handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; - int pages_written = 0; - unsigned int max_pages; - int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0; - long desired_nr_to_write, nr_to_writebump = 0; - loff_t range_start = wbc->range_start; + int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - pgoff_t done_index = 0; - pgoff_t end; + bool done; struct blk_plug plug; + bool give_up_on_write = false; - trace_ext4_da_writepages(inode, wbc); + trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2498,170 +2391,171 @@ static int ext4_da_writepages(struct address_space *mapping, if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + if (ext4_should_journal_data(inode)) { + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; + } + /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because * the latter could be true if the filesystem is mounted - * read-only, and in that case, ext4_da_writepages should + * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert upto one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + } + + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - range_cyclic = wbc->range_cyclic; if (wbc->range_cyclic) { - index = mapping->writeback_index; - if (index) + writeback_index = mapping->writeback_index; + if (writeback_index) cycled = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = LLONG_MAX; - wbc->range_cyclic = 0; - end = -1; + mpd.first_page = writeback_index; + mpd.last_page = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - } - - /* - * This works around two forms of stupidity. The first is in - * the writeback code, which caps the maximum number of pages - * written to be 1024 pages. This is wrong on multiple - * levels; different architectues have a different page size, - * which changes the maximum amount of data which gets - * written. Secondly, 4 megabytes is way too small. XFS - * forces this value to be 16 megabytes by multiplying - * nr_to_write parameter by four, and then relies on its - * allocator to allocate larger extents to make them - * contiguous. Unfortunately this brings us to the second - * stupidity, which is that ext4's mballoc code only allocates - * at most 2048 blocks. So we force contiguous writes up to - * the number of dirty blocks in the inode, or - * sbi->max_writeback_mb_bump whichever is smaller. - */ - max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) { - if (wbc->nr_to_write == LONG_MAX) - desired_nr_to_write = wbc->nr_to_write; - else - desired_nr_to_write = wbc->nr_to_write * 8; - } else - desired_nr_to_write = ext4_num_dirty_pages(inode, index, - max_pages); - if (desired_nr_to_write > max_pages) - desired_nr_to_write = max_pages; - - if (wbc->nr_to_write < desired_nr_to_write) { - nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; - wbc->nr_to_write = desired_nr_to_write; + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; } + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + done = false; blk_start_plug(&plug); - while (!ret && wbc->nr_to_write > 0) { + while (!done && mpd.first_page <= mpd.last_page) { + /* For each extent of pages we use new io_end */ + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + break; + } /* - * we insert one extent at a time. So we need - * credit needed for single extent allocation. - * journalled mode is currently not supported - * by delalloc + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); - /* start a new transaction*/ - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - needed_blocks); + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); - blk_finish_plug(&plug); - goto out_writepages; + /* Release allocated io_end */ + ext4_put_io_end(mpd.io_submit.io_end); + break; } - /* - * Now call write_cache_pages_da() to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4 and submit them. - */ - ret = write_cache_pages_da(handle, mapping, - wbc, &mpd, &done_index); - /* - * If we have a contiguous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. - */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - mpage_da_map_and_submit(&mpd); - ret = MPAGE_DA_EXTENT_TAIL; + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); + ret = mpage_prepare_extent_to_map(&mpd); + if (!ret) { + if (mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, + &give_up_on_write); + else { + /* + * We scanned the whole range (or exhausted + * nr_to_write), submitted what was mapped and + * didn't find anything needing mapping. We are + * done. + */ + done = true; + } } - trace_ext4_da_write_pages(inode, &mpd); - wbc->nr_to_write -= mpd.pages_written; - ext4_journal_stop(handle); - - if ((mpd.retval == -ENOSPC) && sbi->s_journal) { - /* commit the transaction which would + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, give_up_on_write); + /* Drop our io_end reference we got from init */ + ext4_put_io_end(mpd.io_submit.io_end); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; - } else if (ret == MPAGE_DA_EXTENT_TAIL) { - /* - * Got one extent now try with rest of the pages. - * If mpd.retval is set -EIO, journal is aborted. - * So we don't need to write any more. - */ - pages_written += mpd.pages_written; - ret = mpd.retval; - io_done = 1; - } else if (wbc->nr_to_write) - /* - * There is no more writeout needed - * or we requested for a noblocking writeout - * and we found the device congested - */ + continue; + } + /* Fatal error - ENOMEM, EIO... */ + if (ret) break; } blk_finish_plug(&plug); - if (!io_done && !cycled) { + if (!ret && !cycled) { cycled = 1; - index = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = mapping->writeback_index - 1; + mpd.last_page = writeback_index - 1; + mpd.first_page = 0; goto retry; } /* Update index */ - wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* - * set the writeback_index so that range_cyclic + * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = done_index; + mapping->writeback_index = mpd.first_page; out_writepages: - wbc->nr_to_write -= nr_to_writebump; - wbc->range_start = range_start; - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); return ret; } static int ext4_nonda_switch(struct super_block *sb) { - s64 free_blocks, dirty_blocks; + s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* @@ -2672,17 +2566,18 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ - if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark @@ -2818,18 +2713,9 @@ static int ext4_da_write_end(struct file *file, unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - if (write_mode == FALL_BACK_TO_NONDELALLOC) { - switch (ext4_inode_journal_mode(inode)) { - case EXT4_INODE_ORDERED_DATA_MODE: - return ext4_ordered_write_end(file, mapping, pos, - len, copied, page, fsdata); - case EXT4_INODE_WRITEBACK_DATA_MODE: - return ext4_writeback_write_end(file, mapping, pos, - len, copied, page, fsdata); - default: - BUG(); - } - } + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); @@ -2875,7 +2761,8 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { /* * Drop reserved blocks @@ -2884,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) if (!page_has_buffers(page)) goto out; - ext4_da_page_release_reservation(page, offset); + ext4_da_page_release_reservation(page, offset, length); out: - ext4_invalidatepage(page, offset); + ext4_invalidatepage(page, offset, length); return; } @@ -2910,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * - * ext4_da_writepages() -> + * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() @@ -3035,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - trace_ext4_invalidatepage(page, offset); + trace_ext4_invalidatepage(page, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); - block_invalidatepage(page, offset); + block_invalidatepage(page, offset, length); } static int __ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); - trace_ext4_journalled_invalidatepage(page, offset); + trace_ext4_journalled_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - return jbd2_journal_invalidatepage(journal, page, offset); + return jbd2_journal_invalidatepage(journal, page, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { - WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); + WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3113,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - goto out; + /* if not async direct IO just return */ + if (!io_end) { + inode_dio_done(inode); + if (is_async) + aio_complete(iocb, ret, 0); + return; + } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3123,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; - - /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); - return; - } - io_end->offset = offset; io_end->size = size; if (is_async) { io_end->iocb = iocb; io_end->result = ret; } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } /* @@ -3175,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) @@ -3182,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, BUG_ON(iocb->private == NULL); + /* + * Make all waiters for direct IO properly wait also for extent + * conversion. This also disallows race between truncate() and + * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + */ + if (rw == WRITE) + atomic_inc(&inode->i_dio_count); + /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); if (overwrite) { - atomic_inc(&inode->i_dio_count); down_read(&EXT4_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } @@ -3213,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) { ret = -ENOMEM; goto retake_lock; } io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; + /* + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); /* * we save the io structure for current async direct * IO, so that later ext4_map_blocks() could flag the @@ -3243,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, dio_flags); - if (iocb->private) - ext4_inode_aio_set(inode, NULL); /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); + /* + * When no IO was submitted ext4_end_io_dio() was not + * called so we have to put iocb's reference. + */ + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { + WARN_ON(iocb->private != io_end); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->iocb); + /* + * Generic code already did inode_dio_done() so we + * have to clear EXT4_IO_END_DIRECT to not do it for + * the second time. + */ + io_end->flag = 0; + ext4_put_io_end(io_end); + iocb->private = NULL; + } + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already * completed, we could do the conversion right here */ - err = ext4_convert_unwritten_extents(inode, + err = ext4_convert_unwritten_extents(NULL, inode, offset, ret); if (err < 0) ret = err; @@ -3277,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, } retake_lock: + if (rw == WRITE) + inode_dio_done(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { - inode_dio_done(inode); up_read(&EXT4_I(inode)->i_data_sem); mutex_lock(&inode->i_mutex); } @@ -3334,27 +3237,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_writeback_aops = { +static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, + .write_end = ext4_write_end, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3368,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, @@ -3383,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, - .writepages = ext4_da_writepages, + .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .bmap = ext4_bmap, @@ -3399,108 +3289,73 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_ordered_aops; + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_WRITEBACK_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_writeback_aops; + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; - break; + return; default: BUG(); } + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_aops; } - /* - * ext4_discard_partial_page_buffers() - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. - * This function finds and locks the page containing the offset - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. - * Calling functions that already have the page locked should call - * ext4_discard_partial_page_buffers_no_lock directly. + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags) +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) { + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; struct inode *inode = mapping->host; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, - from, length, flags); + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); - unlock_page(page); - page_cache_release(page); - return err; + return ext4_block_zero_page_range(handle, mapping, from, length); } /* - * ext4_discard_partial_page_buffers_no_lock() - * Zeros a page range of length 'length' starting from offset 'from'. - * Buffer heads that correspond to the block aligned regions of the - * zeroed range will be unmapped. Unblock aligned regions - * will have the corresponding buffer head mapped if needed so that - * that region of the page can be updated with the partial zero out. - * - * This function assumes that the page has already been locked. The - * The range to be discarded must be contained with in the given page. - * If the specified range exceeds the end of the page it will be shortened - * to the end of the page that corresponds to 'from'. This function is - * appropriate for updating a page and it buffer heads to be unmapped and - * zeroed for blocks that have been either released, or are going to be - * released. - * - * handle: The journal handle - * inode: The files inode - * page: A locked page that contains the offset "from" - * from: The starting byte offset (from the beginning of the file) - * to begin discarding - * len: The length of bytes to discard - * flags: Optional flags that may be used: - * - * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED - * Only zero the regions of the page whose buffer heads - * have already been unmapped. This flag is appropriate - * for updating the contents of a page whose blocks may - * have already been released, and we only want to zero - * out the regions that correspond to those released blocks. - * - * Returns zero on success or negative on failure. + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' */ -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags) +int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned int offset = from & (PAGE_CACHE_SIZE-1); - unsigned int blocksize, max, pos; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, max, pos; ext4_lblk_t iblock; + struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; - blocksize = inode->i_sb->s_blocksize; - max = PAGE_CACHE_SIZE - offset; + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; - if (index != page->index) - return -EINVAL; + blocksize = inode->i_sb->s_blocksize; + max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between - * 'from' and the end of the page + * 'from' and the end of the block */ if (length > max || length < 0) length = max; @@ -3518,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, iblock++; pos += blocksize; } - - pos = offset; - while (pos < offset + length) { - unsigned int end_of_block, range_to_discard; - - err = 0; - - /* The length of space left to zero and unmap */ - range_to_discard = offset + length - pos; - - /* The length of space until the end of the block */ - end_of_block = blocksize - (pos & (blocksize-1)); - - /* - * Do not unmap or zero past end of block - * for this buffer head - */ - if (range_to_discard > end_of_block) - range_to_discard = end_of_block; - - - /* - * Skip this buffer head if we are only zeroing unampped - * regions of the page - */ - if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && - buffer_mapped(bh)) - goto next; - - /* If the range is block aligned, unmap */ - if (range_to_discard == blocksize) { - clear_buffer_dirty(bh); - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); - clear_buffer_uptodate(bh); - zero_user(page, pos, range_to_discard); - BUFFER_TRACE(bh, "Buffer discarded"); - goto next; - } - - /* - * If this block is not completely contained in the range - * to be discarded, then it is not going to be released. Because - * we need to keep this block, we need to make sure this part - * of the page is uptodate before we modify it by writeing - * partial zeros on it. - */ + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { - /* - * Buffer head must be mapped before we can read - * from the block - */ - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto next; - } + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; } + } - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt.*/ - if (!buffer_uptodate(bh)) - goto next; - } + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto next; - } + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + err = 0; + mark_buffer_dirty(bh); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + err = ext4_jbd2_file_inode(handle, inode); + } - zero_user(page, pos, range_to_discard); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); +int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t length) +{ + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned partial_start, partial_end; + ext4_fsblk_t start, end; + loff_t byte_end = (lstart + length - 1); + int err = 0; - BUFFER_TRACE(bh, "Partial buffer zeroed"); -next: - bh = bh->b_this_page; - iblock++; - pos += range_to_discard; - } + partial_start = lstart & (sb->s_blocksize - 1); + partial_end = byte_end & (sb->s_blocksize - 1); + start = lstart >> sb->s_blocksize_bits; + end = byte_end >> sb->s_blocksize_bits; + + /* Handle partial zero within the single block */ + if (start == end && + (partial_start || (partial_end != sb->s_blocksize - 1))) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, length); + return err; + } + /* Handle partial zero out on the start of the range */ + if (partial_start) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, sb->s_blocksize); + if (err) + return err; + } + /* Handle partial zero out on the end of the range */ + if (partial_end != sb->s_blocksize - 1) + err = ext4_block_zero_page_range(handle, mapping, + byte_end - partial_end, + partial_end + 1); return err; } @@ -3643,23 +3483,128 @@ int ext4_can_truncate(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) { - struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + loff_t first_block_offset, last_block_offset; + handle_t *handle; + unsigned int credits; + int ret = 0; + if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ind_punch_hole(file, offset, length); - - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + if (EXT4_SB(sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ return -EOPNOTSUPP; } trace_ext4_punch_hole(inode, offset, length); - return ext4_ext_punch_hole(file, offset, length); + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + + /* Now release the pages and zero block aligned part of pages*/ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; + } + + ret = ext4_zero_partial_blocks(handle, inode, offset, + length); + if (ret) + goto out_stop; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_free_hole_blocks(handle, inode, first_block, + stop_block); + + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -3692,6 +3637,18 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + + /* + * There is a possibility that we're either freeing the inode + * or it completely new indode. In those cases we might not + * have i_mutex locked because it's not necessary. + */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -3711,9 +3668,59 @@ void ext4_truncate(struct inode *inode) } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(inode); + credits = ext4_writepage_trans_blocks(inode); else - ext4_ind_truncate(inode); + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } + + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + + ext4_discard_preallocations(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_ext_truncate(handle, inode); + else + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -3821,13 +3828,14 @@ make_io: if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; - end = b + EXT4_SB(sb)->s_inode_readahead_blks; + end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); @@ -4024,8 +4032,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ ret = -ESTALE; goto bad_inode; @@ -4033,7 +4042,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete - * the process of deleting those. */ + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); @@ -4153,6 +4164,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); } else { ret = -EIO; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); @@ -4435,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) inode->i_size >> PAGE_CACHE_SHIFT); if (!page) return; - ret = __ext4_journalled_invalidatepage(page, offset); + ret = __ext4_journalled_invalidatepage(page, offset, + PAGE_CACHE_SIZE - offset); unlock_page(page); page_cache_release(page); if (ret != -EBUSY) @@ -4617,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode; - unsigned long delalloc_blocks; + unsigned long long delalloc_blocks; inode = dentry->d_inode; generic_fillattr(inode, stat); @@ -4635,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); - stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); return 0; } -static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_ind_trans_blocks(inode, nrblocks, chunk); - return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); } /* @@ -4657,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -4665,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) int ret = 0; /* - * How many index blocks need to touch to modify nrblocks? - * The "Chunk" flag indicating whether the nrblocks is - * physically contiguous on disk - * - * For Direct IO and fallocate, they calls get_block to allocate - * one single extent at a time, so they could set the "Chunk" flag + * How many index blocks need to touch to map @lblocks logical blocks + * to @pextents physical extents? */ - idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; @@ -4680,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; - if (chunk) - groups += 1; - else - groups += nrblocks; - + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -4716,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) int bpp = ext4_journal_blocks_per_page(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, 0); + ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 721f4d33e148..9491ac0590f7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,9 +17,201 @@ #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) +/** + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. + * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); + memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi; + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { + err = -EINVAL; + goto swap_boot_out; + } + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto swap_boot_out; + } + + sbi = EXT4_SB(sb); + ei = EXT4_I(inode); + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) { + err = PTR_ERR(inode_bl); + goto swap_boot_out; + } + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. */ + ext4_inode_double_lock(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto swap_boot_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + + ext4_journal_stop(handle); + + ext4_double_up_write_data_sem(inode, inode_bl); + + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + + ext4_inode_double_unlock(inode, inode_bl); + + iput(inode_bl); + +swap_boot_out: + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } - if (oldflags & EXT4_EXTENTS_FL) { - /* We don't support clearning extent flags */ - if (!(flags & EXT4_EXTENTS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (flags & EXT4_EXTENTS_FL) { - /* migrate the file */ + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - flags &= ~EXT4_EXTENTS_FL; - } if (flags & EXT4_EOFBLOCKS_FL) { /* we don't support adding EOFBLOCKS flag */ @@ -137,8 +320,13 @@ flags_err: err = ext4_change_inode_journal_flag(inode, jflag); if (err) goto flags_out; - if (migrate) - err = ext4_ext_migrate(inode); + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write_file(filp); @@ -357,9 +545,13 @@ group_add_out: return err; } + case EXT4_IOC_SWAP_BOOT: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + return swap_inode_boot_loader(sb, inode); + case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ee6614bdb639..a9ff5e5137ca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) ext4_clear_bit(bit, addr); } +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) struct page *page; int ret = 0; + might_sleep(); mb_debug(1, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* @@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; + might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; @@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len) } } +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len) } } +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) + int first, int count) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, e4b->bd_bitmap); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, e4b->bd_bitmap); - if (block && max) - e4b->bd_info->bb_fragments--; - else if (!block && !max) - e4b->bd_info->bb_fragments++; + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; + if (unlikely(block != -1)) { + ext4_fsblk_t blocknr; - if (!mb_test_bit(block, e4b->bd_bitmap)) { - ext4_fsblk_t blocknr; - - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), block); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block " - "(bit %u)", block); - } - mb_clear_bit(block, e4b->bd_bitmap); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); + mb_regenerate_buddy(e4b); + goto done; + } - if (!buddy2) - break; + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) + e4b->bd_info->bb_fragments--; + else if (!left_is_free && !right_is_free) + e4b->bd_info->bb_fragments++; - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } +done: mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1994,7 +2105,12 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - if (group == ngroups) + cond_resched(); + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + if (group >= ngroups) group = 0; /* This now checks without needing the buddy page */ @@ -2149,7 +2265,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); @@ -3342,7 +3458,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, if (pa->pa_type == MB_GROUP_PA) grp_blk--; - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3807,7 +3923,7 @@ repeat: list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { @@ -4069,7 +4185,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, "Error loading buddy information for %u", group); @@ -4217,6 +4333,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4289,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) { - ext4_discard_allocated_blocks(ac); - goto errout; - } + if (*errp) + goto discard_and_exit; /* as we've just preallocated more space than - * user requested orinally, we store allocated + * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4420,11 +4540,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry)) { + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4432,10 +4552,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry)) { + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4470,6 +4590,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; + might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -4495,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, BUG_ON(bh && (count > 1)); for (i = 0; i < count; i++) { + cond_resched(); if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); - if (unlikely(!tbh)) + if (!tbh) continue; ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 480acf4a085f..49e8bdff9163 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) return retval; } return retval; - } int ext4_ext_migrate(struct inode *inode) @@ -606,3 +605,64 @@ out: return retval; } + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f9b551561d2c..214461e42a05 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -7,7 +7,7 @@ #include "ext4.h" /* Checksumming functions */ -static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct mmp_struct, mmp_checksum); @@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE_SYNC, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { brelse(*bh); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 33e1c086858b..e86dddbd8296 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem * * Acquire write lock of i_data_sem of the two inodes */ -static void -double_down_write_data_sem(struct inode *first, struct inode *second) +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); @@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second) } /** - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ -static void -double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, mext_insert_inside_block(o_start, o_end, start_ext, new_ext, end_ext, eh, range_to_move); - if (depth) { - ret = ext4_handle_dirty_metadata(handle, orig_inode, - orig_path->p_bh); - if (ret) - return ret; - } else { - ret = ext4_mark_inode_dirty(handle, orig_inode); - if (ret < 0) - return ret; - } - - return 0; + return ext4_ext_dirty(handle, orig_inode, orig_path); } /** @@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, donor_off += dext_alen; orig_off += dext_alen; + BUG_ON(replaced_count > count); /* Already moved the expected blocks */ if (replaced_count >= count) break; @@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, page_cache_release(page[0]); return -ENOMEM; } - + /* + * grab_cache_page_write_begin() may not wait on page's writeback if + * BDI not demand that. But it is reasonable to be very conservative + * here and explicitly wait on page's writeback + */ + wait_on_page_writeback(page[0]); + wait_on_page_writeback(page[1]); if (inode1 > inode2) { struct page *tmp; tmp = page[0]; @@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { - int err = 0; err = ext4_get_block(inode, block, bh, 0); if (err) { SetPageError(page); @@ -915,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, struct page *pagep[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset; - long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; @@ -943,8 +939,6 @@ again: orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; - offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { @@ -976,7 +970,7 @@ again: * necessary, just swap data blocks between orig and donor. */ if (uninit) { - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ uninit = mext_check_coverage(orig_inode, orig_blk_offset, @@ -990,7 +984,7 @@ again: goto drop_data_sem; if (!uninit) { - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if ((page_has_private(pagep[0]) && @@ -1004,7 +998,7 @@ again: donor_inode, orig_blk_offset, block_len_in_page, err); drop_data_sem: - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; } data_copy: @@ -1033,7 +1027,7 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, from + replaced_size, + *err = __block_write_begin(pagep[0], from, replaced_size, ext4_get_block); if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); @@ -1065,11 +1059,11 @@ repair_branches: * Extents are swapped already, but we are not able to copy data. * Try to swap extents to it's original places */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, orig_blk_offset, block_len_in_page, &err2); - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), "Unable to copy data block," @@ -1209,15 +1203,15 @@ mext_check_arguments(struct inode *orig_inode, } /** - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * * Lock two inodes' i_mutex */ -static void -mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) { BUG_ON(inode1 == inode2); if (inode1 < inode2) { @@ -1230,15 +1224,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) } /** - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 * * @inode1: the inode that is released first * @inode2: the inode that is released second * */ -static void -mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) { mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); @@ -1333,7 +1327,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } /* Protect orig and donor inodes against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + ext4_inode_double_lock(orig_inode, donor_inode); /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(orig_inode); @@ -1342,7 +1336,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); @@ -1466,7 +1460,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * b. racing with ->readpage, ->write_begin, and ext4_get_block * in move_extent_per_page */ - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1500,7 +1494,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; @@ -1538,10 +1532,10 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); - mext_inode_double_unlock(orig_inode, donor_inode); + ext4_inode_double_unlock(orig_inode, donor_inode); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3825d6aa8336..234b834d5a97 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - __u32 csum, old_csum; + __u32 csum; + __le32 save_csum; int size; size = count_offset + (count * sizeof(struct dx_entry)); - old_csum = t->dt_checksum; + save_csum = t->dt_checksum; t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = old_csum; + t->dt_checksum = save_csum; return cpu_to_le32(csum); } @@ -917,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse(bh); - return count; + /* silently ignore the rest of the block */ + break; } ext4fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -971,6 +969,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; @@ -1455,24 +1464,6 @@ struct dentry *ext4_get_parent(struct dentry *child) return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. @@ -2251,8 +2242,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2286,8 +2276,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2307,6 +2296,45 @@ retry: return err; } +static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + inode = ext4_new_inode_start_handle(dir, mode, + NULL, 0, NULL, + EXT4_HT_DIR, + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT4_XATTR_TRANS_BLOCKS); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = ext4_orphan_add(handle, inode); + if (err) + goto err_drop_inode; + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + ext4_journal_stop(handle); + unlock_new_inode(inode); + iput(inode); + return err; +} + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, @@ -2396,8 +2424,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, &dentry->d_name, @@ -2826,8 +2853,7 @@ static int ext4_symlink(struct inode *dir, * quota blocks, sb is already counted in previous macros). */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } retry: inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, @@ -2916,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry, retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2930,6 +2956,11 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); } else { drop_nlink(inode); @@ -3182,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = { .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, + .tmpfile = ext4_tmpfile, .rename = ext4_rename, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 047a6de04a0a..48786cdb5e6c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,6 +18,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> @@ -29,82 +30,137 @@ #include "xattr.h" #include "acl.h" -static struct kmem_cache *io_page_cachep, *io_end_cachep; +static struct kmem_cache *io_end_cachep; int __init ext4_init_pageio(void) { - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) - return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_end_cachep == NULL) { - kmem_cache_destroy(io_page_cachep); + if (io_end_cachep == NULL) return -ENOMEM; - } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); - kmem_cache_destroy(io_page_cachep); } /* - * This function is called by ext4_evict_inode() to make sure there is - * no more pending I/O completion work left to do. + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... */ -void ext4_ioend_shutdown(struct inode *inode) +static void buffer_io_error(struct buffer_head *bh) { - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); - /* - * We need to make sure the work structure is finished being - * used before we let the inode get destroyed. - */ - if (work_pending(&EXT4_I(inode)->i_unwritten_work)) - cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); } -static void put_io_page(struct ext4_io_page *io_page) +static void ext4_finish_bio(struct bio *bio) { - if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); - put_page(io_page->p_page); - kmem_cache_free(io_page_cachep, io_page); + int i; + int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + + for (i = 0; i < bio->bi_vcnt; i++) { + struct bio_vec *bvec = &bio->bi_io_vec[i]; + struct page *page = bvec->bv_page; + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); } } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_release_io_end(ext4_io_end_t *io_end) { - int i; + struct bio *bio, *next_bio; + + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); - BUG_ON(!io); - BUG_ON(!list_empty(&io->list)); - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); - for (i = 0; i < io->num_io_pages; i++) - put_io_page(io->pages[i]); - io->num_io_pages = 0; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); + } + if (io_end->flag & EXT4_IO_END_DIRECT) + inode_dio_done(io_end->inode); + if (io_end->iocb) + aio_complete(io_end->iocb, io_end->result, 0); + kmem_cache_free(io_end_cachep, io_end); } -/* check a range of space and convert unwritten extents to written. */ +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); +} + +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */ static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + handle_t *handle = io->handle; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - ret = ext4_convert_unwritten_extents(inode, offset, size); + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); if (ret < 0) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " @@ -112,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - if (io->iocb) - aio_complete(io->iocb, io->result, 0); + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); return ret; } -static void dump_completed_IO(struct inode *inode) +static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { - ext4_debug("inode %lu completed_io list is empty\n", - inode->i_ino); + if (list_empty(head)) return; - } - ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -149,23 +197,30 @@ static void dump_completed_IO(struct inode *inode) } /* Add the io_end to per-inode completed end_io list. */ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct workqueue_struct *wq; unsigned long flags; BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&ei->i_completed_io_list)) - queue_work(wq, &ei->i_unwritten_work); - list_add_tail(&io_end->list, &ei->i_completed_io_list); + if (io_end->handle) { + wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + } else { + wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; + if (list_empty(&ei->i_unrsv_conversion_list)) + queue_work(wq, &ei->i_unrsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); + } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } -static int ext4_do_flush_completed_IO(struct inode *inode) +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) { ext4_io_end_t *io; struct list_head unwritten; @@ -174,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode) int err, ret = 0; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - dump_completed_IO(inode); - list_replace_init(&ei->i_completed_io_list, &unwritten); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { @@ -186,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode) err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - io->flag &= ~EXT4_IO_END_UNWRITTEN; - ext4_free_io_end(io); } return ret; } /* - * work on completed aio dio IO, to convert unwritten extents to extents + * work on completed IO, to convert unwritten extents to extents */ -void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_rsv_work(struct work_struct *work) { struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, - i_unwritten_work); - ext4_do_flush_completed_IO(&ei->vfs_inode); + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } -int ext4_flush_unwritten_io(struct inode *inode) +void ext4_end_io_unrsv_work(struct work_struct *work) { - int ret; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && - !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode); - ext4_unwritten_wait(inode); - return ret; + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_unrsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -219,72 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } -/* - * Print an buffer I/O error compatible with the fs/buffer.c. This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message. We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... - */ -static void buffer_io_error(struct buffer_head *bh) +void ext4_put_io_end_defer(ext4_io_end_t *io_end) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; } static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - struct inode *inode; - int i; sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); - bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - bio_put(bio); - for (i = 0; i < io_end->num_io_pages; i++) { - struct page *page = io_end->pages[i]->p_page; - struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; - - if (error) { - SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; - - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; - bh = head; - do { - if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - - offset += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - put_io_page(io_end->pages[i]); + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + } else { + ext4_finish_bio(bio); + bio_put(bio); } - io_end->num_io_pages = 0; - inode = io_end->inode; if (error) { - io_end->flag |= EXT4_IO_END_ERROR; + struct inode *inode = io_end->inode; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " "(offset %llu size %ld starting block %llu)", inode->i_ino, @@ -293,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); } - - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; - } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -313,76 +355,53 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + if (!bio) + return -ENOMEM; bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, - struct ext4_io_page *io_page, struct inode *inode, - struct writeback_control *wbc, struct buffer_head *bh) { - ext4_io_end_t *io_end; int ret; - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } - io_end = io->io_end; - if ((io_end->num_io_pages >= MAX_IO_PAGES) && - (io_end->pages[io_end->num_io_pages-1] != io_page)) - goto submit_and_retry; - if (buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - if ((io_end->num_io_pages == 0) || - (io_end->pages[io_end->num_io_pages-1] != io_page)) { - io_end->pages[io_end->num_io_pages++] = io_page; - atomic_inc(&io_page->p_count); - } + io->io_next_block++; return 0; } @@ -392,33 +411,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - struct ext4_io_page *io_page; + unsigned block_start, blocksize; struct buffer_head *bh, *head; int ret = 0; + int nr_submitted = 0; blocksize = 1 << inode->i_blkbits; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); - if (!io_page) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - io_page->p_page = page; - atomic_set(&io_page->p_count, 1); - get_page(page); set_page_writeback(page); ClearPageError(page); - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - block_end = block_start + blocksize; + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); if (block_start >= len) { /* * Comments copied from block_write_full_page_endio: @@ -431,7 +446,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * mapped, and writes to that region are not written * out to the file." */ - zero_user_segment(page, block_start, block_end); + zero_user_segment(page, block_start, + block_start + blocksize); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; @@ -445,7 +461,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + /* Now submit buffers to write */ + bh = head = page_buffers(page); + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, bh); if (ret) { /* * We only get here on ENOMEM. Not much else @@ -455,17 +483,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, redirty_page_for_writepage(wbc, page); break; } + nr_submitted++; clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... */ + if (ret) { + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); } unlock_page(page); - /* - * If the page was truncated before we could do the writeback, - * or we had a memory allocation error while trying to write - * the first buffer head, we won't have submitted any pages for - * I/O. In that case we need to make sure we've cleared the - * PageWriteback bit from the page to prevent the system from - * wedging later on. - */ - put_io_page(io_page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c169477a62c9..c5adbb318a90 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb, ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_group_overhead_blocks(sb, group); - ext4_fsblk_t metaend = start + overhead; + unsigned overhead; + ext4_fsblk_t metaend; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; int err = -EINVAL; + if (group != sbi->s_groups_count) { + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + return -EINVAL; + } + + overhead = ext4_group_overhead_blocks(sb, group); + metaend = start + overhead; input->free_blocks_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; @@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb, free_blocks_count, input->reserved_blocks); ext4_get_group_no_and_offset(sb, start, NULL, &offset); - if (group != sbi->s_groups_count) - ext4_warning(sb, "Cannot add at group %u (only %u groups)", - input->group, sbi->s_groups_count); - else if (offset != 0) + if (offset != 0) ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) ext4_warning(sb, "Reserved blocks too high (%u)", @@ -272,7 +277,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -284,7 +289,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -296,7 +301,7 @@ next_group: if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count -= EXT4_SB(sb)->s_itb_per_group; @@ -392,7 +397,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_group_t group; int err; - ext4_get_group_no_and_offset(sb, block, &group, NULL); + group = ext4_get_group_number(sb, block); start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; @@ -1341,6 +1346,8 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ @@ -1549,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; struct inode *inode = NULL; - int gdb_off, gdb_num; + int gdb_off; int err; __u16 bg_flags = 0; - gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -1654,12 +1660,10 @@ errout: err = err2; if (!err) { - ext4_fsblk_t first_block; - first_block = ext4_group_first_block_no(sb, 0); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block), 0); } return err; @@ -1879,7 +1883,11 @@ retry: /* Nothing need to do */ return 0; - ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5d6d53578124..85b3dd60169b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -81,6 +82,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { @@ -353,10 +355,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce, *tmp; + struct ext4_journal_cb_entry *jce; + BUG_ON(txn->t_state == T_FINISHED); spin_lock(&sbi->s_md_lock); - list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); @@ -394,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb) } if (test_opt(sb, ERRORS_RO)) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) @@ -418,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } -void ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) +void __ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; struct va_format vaf; @@ -447,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function, ext4_handle_error(inode->i_sb); } -void ext4_error_file(struct file *file, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) +void __ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; struct va_format vaf; @@ -566,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function, if ((sb->s_flags & MS_RDONLY) == 0) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -576,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from previous error\n"); } -void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +void __ext4_msg(struct super_block *sb, + const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -699,22 +715,19 @@ fail: /* * Release the journal device */ -static int ext4_blkdev_put(struct block_device *bdev) +static void ext4_blkdev_put(struct block_device *bdev) { - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext4_blkdev_put(bdev); + ext4_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -749,8 +762,10 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - flush_workqueue(sbi->dio_unwritten_wq); - destroy_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->unrsv_conversion_wq); + flush_workqueue(sbi->rsv_conversion_wq); + destroy_workqueue(sbi->unrsv_conversion_wq); + destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); @@ -759,7 +774,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } - ext4_es_unregister_shrinker(sb); + ext4_es_unregister_shrinker(sbi); del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); @@ -848,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_lru); ei->i_es_lru_nr = 0; + ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -858,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_quota = 0; #endif ei->jinode = NULL; - INIT_LIST_HEAD(&ei->i_completed_io_list); + INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); - INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); + INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); return &ei->vfs_inode; } @@ -1092,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = { .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, + .sync_fs = ext4_sync_fs_nojournal, .put_super = ext4_put_super, .statfs = ext4_statfs, .remount_fs = ext4_remount, @@ -1802,7 +1821,7 @@ static int options_seq_show(struct seq_file *seq, void *offset) static int options_open_fs(struct inode *inode, struct file *file) { - return single_open(file, options_seq_show, PDE(inode)->data); + return single_open(file, options_seq_show, PDE_DATA(inode)); } static const struct file_operations ext4_seq_options_fops = { @@ -1907,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group; - unsigned int groups_per_flex = 0; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; @@ -1915,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb) sbi->s_log_groups_per_flex = 0; return 1; } - groups_per_flex = 1U << sbi->s_log_groups_per_flex; err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) @@ -1948,16 +1965,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, if ((sbi->s_es->s_feature_ro_compat & cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { /* Use new metadata_csum algorithm */ - __u16 old_csum; + __le16 save_csum; __u32 csum32; - old_csum = gdp->bg_checksum; + save_csum = gdp->bg_checksum; gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, sbi->s_desc_size); - gdp->bg_checksum = old_csum; + gdp->bg_checksum = save_csum; crc = csum32 & 0xFFFF; goto out; @@ -2163,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb, list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); dquot_initialize(inode); if (inode->i_nlink) { - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); mutex_lock(&inode->i_mutex); + truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); mutex_unlock(&inode->i_mutex); nr_truncates++; } else { - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -2376,20 +2396,21 @@ struct ext4_attr { ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, const char *, size_t); - int offset; + union { + int offset; + int deprecated_val; + } u; }; -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) { - char *endp; - - *value = simple_strtoul(skip_spaces(buf), &endp, 0); - endp = skip_spaces(endp); - if (*endp || *value > max) - return -EINVAL; + int ret; - return 0; + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; } static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, @@ -2431,11 +2452,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, const char *buf, size_t count) { unsigned long t; + int ret; - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; - if (t && !is_power_of_2(t)) + if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -2445,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, static ssize_t sbi_ui_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } @@ -2454,15 +2477,38 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) { - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); unsigned long t; + int ret; - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; *ui = t; return count; } +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? ret : count; +} + static ssize_t trigger_test_error(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2480,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a, return count; } +static ssize_t sbi_deprecated_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); +} + #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ - .offset = offsetof(struct ext4_sb_info, _elname), \ + .u = { \ + .offset = offsetof(struct ext4_sb_info, _elname),\ + }, \ } #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) @@ -2496,10 +2550,19 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) #define ATTR_LIST(name) &ext4_attr_##name.attr +#define EXT4_DEPRECATED_ATTR(_name, _val) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = sbi_deprecated_show, \ + .u = { \ + .deprecated_val = _val, \ + }, \ +} EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RW_ATTR(reserved_clusters); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2509,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); @@ -2517,6 +2580,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -3192,6 +3256,40 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t resv_clusters; + + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * uninitialized extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. + */ + resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -3526,6 +3624,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + /* Do we have standard group size of blocksize * 8 blocks ? */ + if (sbi->s_blocks_per_group == blocksize << 3) + set_opt2(sb, STD_GROUP_SIZE); + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; @@ -3698,6 +3800,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sbi); + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_clusters(sb)); if (!err) { @@ -3720,12 +3825,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); - sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; - /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sb); - /* * set up enough so that it can read an inode */ @@ -3851,12 +3952,20 @@ no_journal: * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ - EXT4_SB(sb)->dio_unwritten_wq = - alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->dio_unwritten_wq) { - printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + EXT4_SB(sb)->rsv_conversion_wq = + alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->rsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); ret = -ENOMEM; - goto failed_mount_wq; + goto failed_mount4; + } + + EXT4_SB(sb)->unrsv_conversion_wq = + alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->unrsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + ret = -ENOMEM; + goto failed_mount4; } /* @@ -3911,6 +4020,13 @@ no_journal: "available"); } + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", ext4_calculate_resv_clusters(sbi)); + goto failed_mount4a; + } + err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " @@ -4003,13 +4119,17 @@ failed_mount4a: sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); - destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + if (EXT4_SB(sb)->unrsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); failed_mount_wq: if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3: + ext4_es_unregister_shrinker(sbi); del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); @@ -4177,7 +4297,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); @@ -4445,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - flush_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->rsv_conversion_wq); + flush_workqueue(sbi->unrsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); + /* + * Data writeback is possible w/o journal transaction, so barrier must + * being sent at the end of the function. But we can skip it if + * transaction_commit will do it for us. + */ + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(sbi->s_journal, target); + ret = jbd2_log_wait_commit(sbi->s_journal, target); } + if (needs_barrier) { + int err; + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } + + return ret; +} + +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) +{ + int ret = 0; + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); + dquot_writeback_dquots(sb, -1); + if (wait && test_opt(sb, BARRIER)) + ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + return ret; } @@ -4742,9 +4895,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t overhead = 0; + ext4_fsblk_t overhead = 0, resv_blocks; u64 fsid; s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; @@ -4756,8 +4910,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); @@ -4945,6 +5100,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, return PTR_ERR(qf_inode); } + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a120b277240..c081e34f717f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 csum, old; + __u32 csum; + __le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); - old = hdr->h_checksum; + save_csum = hdr->h_checksum; hdr->h_checksum = 0; - block_nr = cpu_to_le64(block_nr); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, - sizeof(block_nr)); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, EXT4_BLOCK_SIZE(inode->i_sb)); - hdr->h_checksum = old; + hdr->h_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index aa25deb5c6cd..c767dbdd7fc4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -22,6 +22,7 @@ #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index fd27e7e6326e..e06e0995e00f 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config F2FS_FS_SECURITY + bool "F2FS Security Labels" + depends on F2FS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the f2fs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 137af4255da6..b7826ec1b470 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } } - error = f2fs_setxattr(inode, name_index, "", value, size); + error = f2fs_setxattr(inode, name_index, "", value, size, NULL); kfree(value); if (!error) @@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct posix_acl *acl; int error; - mode_t mode = get_inode_mode(inode); + umode_t mode = get_inode_mode(inode); if (!test_opt(sbi, POSIX_ACL)) return 0; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2b6fc131e2ce..66a6b85a51d8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -20,6 +20,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *orphan_entry_slab; static struct kmem_cache *inode_entry_slab; @@ -57,13 +58,19 @@ repeat: cond_resched(); goto repeat; } - if (f2fs_readpage(sbi, page, index, READ_SYNC)) { + if (PageUptodate(page)) + goto out; + + if (f2fs_readpage(sbi, page, index, READ_SYNC)) + goto repeat; + + lock_page(page); + if (page->mapping != mapping) { f2fs_put_page(page, 1); goto repeat; } +out: mark_page_accessed(page); - - /* We do not allow returning an errorneous page */ return page; } @@ -350,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, unsigned long blk_size = sbi->blocksize; struct f2fs_checkpoint *cp_block; unsigned long long cur_version = 0, pre_version = 0; - unsigned int crc = 0; size_t crc_offset; + __u32 crc = 0; /* Read the 1st cp block in this CP pack */ cp_page_1 = get_meta_page(sbi, cp_addr); @@ -362,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp1; - crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); + crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; @@ -377,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp2; - crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); + crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; @@ -443,13 +450,30 @@ fail_no_cp: return -EINVAL; } -void set_dirty_dir_page(struct inode *inode, struct page *page) +static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct list_head *head = &sbi->dir_inode_list; - struct dir_inode_entry *new; struct list_head *this; + list_for_each(this, head) { + struct dir_inode_entry *entry; + entry = list_entry(this, struct dir_inode_entry, list); + if (entry->inode == inode) + return -EEXIST; + } + list_add_tail(&new->list, head); +#ifdef CONFIG_F2FS_STAT_FS + sbi->n_dirty_dirs++; +#endif + return 0; +} + +void set_dirty_dir_page(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct dir_inode_entry *new; + if (!S_ISDIR(inode->i_mode)) return; retry: @@ -462,23 +486,31 @@ retry: INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) { - kmem_cache_free(inode_entry_slab, new); - goto out; - } - } - list_add_tail(&new->list, head); - sbi->n_dirty_dirs++; + if (__add_dirty_inode(inode, new)) + kmem_cache_free(inode_entry_slab, new); - BUG_ON(!S_ISDIR(inode->i_mode)); -out: inc_page_count(sbi, F2FS_DIRTY_DENTS); inode_inc_dirty_dents(inode); SetPagePrivate(page); + spin_unlock(&sbi->dir_inode_lock); +} + +void add_dirty_dir_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct dir_inode_entry *new; +retry: + new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + if (!new) { + cond_resched(); + goto retry; + } + new->inode = inode; + INIT_LIST_HEAD(&new->list); + spin_lock(&sbi->dir_inode_lock); + if (__add_dirty_inode(inode, new)) + kmem_cache_free(inode_entry_slab, new); spin_unlock(&sbi->dir_inode_lock); } @@ -492,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) - goto out; + if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + spin_unlock(&sbi->dir_inode_lock); + return; + } list_for_each(this, head) { struct dir_inode_entry *entry; @@ -501,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode) if (entry->inode == inode) { list_del(&entry->list); kmem_cache_free(inode_entry_slab, entry); +#ifdef CONFIG_F2FS_STAT_FS sbi->n_dirty_dirs--; +#endif + break; + } + } + spin_unlock(&sbi->dir_inode_lock); + + /* Only from the recovery routine */ + if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { + clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + iput(inode); + } +} + +struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct list_head *head = &sbi->dir_inode_list; + struct list_head *this; + struct inode *inode = NULL; + + spin_lock(&sbi->dir_inode_lock); + list_for_each(this, head) { + struct dir_inode_entry *entry; + entry = list_entry(this, struct dir_inode_entry, list); + if (entry->inode->i_ino == ino) { + inode = entry->inode; break; } } -out: spin_unlock(&sbi->dir_inode_lock); + return inode; } void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) @@ -541,54 +601,44 @@ retry: */ static void block_operations(struct f2fs_sb_info *sbi) { - int t; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + struct blk_plug plug; - /* Stop renaming operation */ - mutex_lock_op(sbi, RENAME); - mutex_lock_op(sbi, DENTRY_OPS); + blk_start_plug(&plug); -retry_dents: - /* write all the dirty dentry pages */ - sync_dirty_dir_inodes(sbi); +retry_flush_dents: + mutex_lock_all(sbi); - mutex_lock_op(sbi, DATA_WRITE); + /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_op(sbi, DATA_WRITE); - goto retry_dents; + mutex_unlock_all(sbi); + sync_dirty_dir_inodes(sbi); + goto retry_flush_dents; } - /* block all the operations */ - for (t = DATA_NEW; t <= NODE_TRUNC; t++) - mutex_lock_op(sbi, t); - - mutex_lock(&sbi->write_inode); - /* * POR: we should ensure that there is no dirty node pages * until finishing nat/sit flush. */ -retry: - sync_node_pages(sbi, 0, &wbc); - - mutex_lock_op(sbi, NODE_WRITE); +retry_flush_nodes: + mutex_lock(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock_op(sbi, NODE_WRITE); - goto retry; + mutex_unlock(&sbi->node_write); + sync_node_pages(sbi, 0, &wbc); + goto retry_flush_nodes; } - mutex_unlock(&sbi->write_inode); + blk_finish_plug(&plug); } static void unblock_operations(struct f2fs_sb_info *sbi) { - int t; - for (t = NODE_WRITE; t >= RENAME; t--) - mutex_unlock_op(sbi, t); + mutex_unlock(&sbi->node_write); + mutex_unlock_all(sbi); } static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) @@ -598,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) block_t start_blk; struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; - unsigned int crc32 = 0; + __u32 crc32 = 0; void *kaddr; int i; @@ -667,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); - *(__le32 *)((unsigned char *)ckpt + - le32_to_cpu(ckpt->checksum_offset)) + *((__le32 *)((unsigned char *)ckpt + + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); start_blk = __start_cp_addr(sbi); @@ -727,9 +777,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + mutex_lock(&sbi->cp_mutex); block_operations(sbi); + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + f2fs_submit_bio(sbi, DATA, true); f2fs_submit_bio(sbi, NODE, true); f2fs_submit_bio(sbi, META, true); @@ -746,13 +800,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) flush_nat_entries(sbi); flush_sit_entries(sbi); - reset_victim_segmap(sbi); - /* unlock all the fs_lock[] in do_checkpoint() */ do_checkpoint(sbi, is_umount); unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } void init_orphan_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a201125..035f9a345cdf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> @@ -21,6 +22,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> /* * Lock ordering for the change of data block address: @@ -54,6 +56,8 @@ int reserve_new_block(struct dnode_of_data *dn) if (!inc_valid_block_count(sbi, dn->inode, 1)) return -ENOSPC; + trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; sync_inode_page(dn); @@ -64,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { struct f2fs_inode_info *fi = F2FS_I(inode); +#ifdef CONFIG_F2FS_STAT_FS struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +#endif pgoff_t start_fofs, end_fofs; block_t start_blkaddr; @@ -74,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, return 0; } +#ifdef CONFIG_F2FS_STAT_FS sbi->total_hit_ext++; +#endif start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; start_blkaddr = fi->ext.blk_addr; @@ -92,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, else bh_result->b_size = UINT_MAX; +#ifdef CONFIG_F2FS_STAT_FS sbi->read_hit_ext++; +#endif read_unlock(&fi->ext.ext_lock); return 1; } @@ -133,7 +143,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) goto end_update; } - /* Frone merge */ + /* Front merge */ if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { fi->ext.fofs--; fi->ext.blk_addr--; @@ -169,7 +179,7 @@ end_update: return; } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -183,7 +193,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) f2fs_put_page(page, 0); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) return ERR_PTR(err); f2fs_put_dnode(&dn); @@ -195,16 +205,24 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) if (dn.data_blkaddr == NEW_ADDR) return ERR_PTR(-EINVAL); - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + err = f2fs_readpage(sbi, page, dn.data_blkaddr, + sync ? READ_SYNC : READA); + if (sync) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } } - unlock_page(page); return page; } @@ -221,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) struct page *page; int err; +repeat: + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + if (!page) + return ERR_PTR(-ENOMEM); + set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); - if (err) + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + f2fs_put_page(page, 1); return ERR_PTR(err); + } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) + if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); - - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + } if (PageUptodate(page)) return page; @@ -241,9 +264,17 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) BUG_ON(dn.data_blkaddr == NULL_ADDR); err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return ERR_PTR(err); + + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } return page; } @@ -251,9 +282,13 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). + * Note that, npage is set only by make_empty_dir. */ -struct page *get_new_data_page(struct inode *inode, pgoff_t index, - bool new_i_size) +struct page *get_new_data_page(struct inode *inode, + struct page *npage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -261,19 +296,21 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); + set_new_dnode(&dn, inode, npage, npage, 0); + err = get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) return ERR_PTR(err); if (dn.data_blkaddr == NULL_ADDR) { if (reserve_new_block(&dn)) { - f2fs_put_dnode(&dn); + if (!npage) + f2fs_put_dnode(&dn); return ERR_PTR(-ENOSPC); } } - f2fs_put_dnode(&dn); - + if (!npage) + f2fs_put_dnode(&dn); +repeat: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -283,18 +320,27 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); } else { err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return ERR_PTR(err); + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } } - SetPageUptodate(page); if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + /* Only the directory inode sets new_i_size */ + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); mark_inode_dirty_sync(inode); } return page; @@ -325,21 +371,15 @@ static void read_end_io(struct bio *bio, int err) /* * Fill the locked page with data located in the block address. - * Read operation is synchronous, and caller must unlock the page. + * Return unlocked page. */ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, int type) { struct block_device *bdev = sbi->sb->s_bdev; - bool sync = (type == READ_SYNC); struct bio *bio; - /* This page can be already read by other threads */ - if (PageUptodate(page)) { - if (!sync) - unlock_page(page); - return 0; - } + trace_f2fs_readpage(page, blk_addr, type); down_read(&sbi->bio_sem); @@ -354,18 +394,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, kfree(bio->bi_private); bio_put(bio); up_read(&sbi->bio_sem); + f2fs_put_page(page, 1); return -EFAULT; } submit_bio(type, bio); up_read(&sbi->bio_sem); - - /* wait for read completion if sync */ - if (sync) { - lock_page(page); - if (PageError(page)) - return -EIO; - } return 0; } @@ -387,14 +421,18 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) + if (check_extent_cache(inode, pgofs, bh_result)) { + trace_f2fs_get_data_block(inode, iblock, bh_result, 0); return 0; + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE); - if (err) + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err) { + trace_f2fs_get_data_block(inode, iblock, bh_result, err); return (err == -ENOENT) ? 0 : err; + } /* It does not support data allocation */ BUG_ON(create); @@ -419,6 +457,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, bh_result->b_size = (i << blkbits); } f2fs_put_dnode(&dn); + trace_f2fs_get_data_block(inode, iblock, bh_result, 0); return 0; } @@ -437,13 +476,12 @@ static int f2fs_read_data_pages(struct file *file, int do_write_data_page(struct page *page) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr, new_blk_addr; struct dnode_of_data dn; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, RDONLY_NODE); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) return err; @@ -459,16 +497,15 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && - need_inplace_update(inode)) { + if (unlikely(old_blk_addr != NEW_ADDR && + !is_cold_data(page) && + need_inplace_update(inode))) { rewrite_data_page(F2FS_SB(inode->i_sb), page, old_blk_addr); } else { write_data_page(inode, page, &dn, old_blk_addr, &new_blk_addr); update_extent_cache(new_blk_addr, &dn); - F2FS_I(inode)->data_version = - le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); } out_writepage: f2fs_put_dnode(&dn); @@ -484,10 +521,11 @@ static int f2fs_write_data_page(struct page *page, const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; unsigned offset; + bool need_balance_fs = false; int err = 0; if (page->index < end_index) - goto out; + goto write; /* * If the offset is out-of-range of file size, @@ -499,50 +537,46 @@ static int f2fs_write_data_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); } - goto unlock_out; + goto out; } zero_user_segment(page, offset, PAGE_CACHE_SIZE); -out: - if (sbi->por_doing) - goto redirty_out; - - if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page)) +write: + if (sbi->por_doing) { + err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; + } - mutex_lock_op(sbi, DATA_WRITE); + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); + err = do_write_data_page(page); + } else { + int ilock = mutex_lock_op(sbi); + err = do_write_data_page(page); + mutex_unlock_op(sbi, ilock); + need_balance_fs = true; } - err = do_write_data_page(page); - if (err && err != -ENOENT) { - wbc->pages_skipped++; - set_page_dirty(page); - } - mutex_unlock_op(sbi, DATA_WRITE); + if (err == -ENOENT) + goto out; + else if (err) + goto redirty_out; if (wbc->for_reclaim) f2fs_submit_bio(sbi, DATA, true); - if (err == -ENOENT) - goto unlock_out; - clear_cold_data(page); +out: unlock_page(page); - - if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode)) + if (need_balance_fs) f2fs_balance_fs(sbi); return 0; -unlock_out: - unlock_page(page); - return (err == -ENOENT) ? 0 : err; - redirty_out: wbc->pages_skipped++; set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; + return err; } #define MAX_DESIRED_PAGES_WP 4096 @@ -561,19 +595,26 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + bool locked = false; int ret; long excess_nrtw = 0, desired_nrtw; + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; excess_nrtw = desired_nrtw - wbc->nr_to_write; wbc->nr_to_write = desired_nrtw; } - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); + locked = true; + } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - if (!S_ISDIR(inode->i_mode)) + if (locked) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); @@ -593,39 +634,33 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; + int ilock; /* for nobh_write_end */ *fsdata = NULL; f2fs_balance_fs(sbi); - +repeat: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - mutex_lock_op(sbi, DATA_NEW); + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); - if (err) { - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } + err = get_dnode_of_data(&dn, index, ALLOC_NODE); + if (err) + goto err; - if (dn.data_blkaddr == NULL_ADDR) { + if (dn.data_blkaddr == NULL_ADDR) err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } - } + f2fs_put_dnode(&dn); + if (err) + goto err; - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -636,21 +671,55 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - return 0; + goto out; } if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return err; + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return -EIO; + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } } +out: SetPageUptodate(page); clear_cold_data(page); return 0; + +err: + mutex_unlock_op(sbi, ilock); + f2fs_put_page(page, 1); + return err; +} + +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + SetPageUptodate(page); + set_page_dirty(page); + + if (pos + copied > i_size_read(inode)) { + i_size_write(inode, pos + copied); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + unlock_page(page); + page_cache_release(page); + return copied; } static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, @@ -667,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, get_data_block_ro); } -static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) +static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -681,7 +751,7 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) static int f2fs_release_data_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } static int f2fs_set_data_page_dirty(struct page *page) @@ -709,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = { .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, - .write_end = nobh_write_end, + .write_end = f2fs_write_end, .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 025b9e2f935d..0d6c6aafb235 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -13,7 +13,6 @@ #include <linux/fs.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/f2fs_fs.h> #include <linux/blkdev.h> #include <linux/debugfs.h> @@ -106,7 +105,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) } } mutex_unlock(&sit_i->sentry_lock); - dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100; + dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) si->avg_vblocks = total_vblocks / ndirty; @@ -138,14 +137,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); if (sbi->segs_per_sec > 1) - si->base_mem += sbi->total_sections * - sizeof(struct sec_entry); + si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(sbi->total_sections); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; @@ -154,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* buld nm */ si->base_mem += sizeof(struct f2fs_nm_info); @@ -177,12 +175,12 @@ get_cache: static int stat_show(struct seq_file *s, void *v) { - struct f2fs_stat_info *si, *next; + struct f2fs_stat_info *si; int i = 0; int j; mutex_lock(&f2fs_stat_mutex); - list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { + list_for_each_entry(si, &f2fs_stat_list, stat_list) { char devname[BDEVNAME_SIZE]; update_general_status(si->sbi); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ecee..62f0d5977c64 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -13,6 +13,7 @@ #include "f2fs.h" #include "node.h" #include "acl.h" +#include "xattr.h" static unsigned long dir_blocks(struct inode *inode) { @@ -60,7 +61,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } @@ -148,7 +149,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = find_data_page(dir, bidx, true); if (IS_ERR(dentry_page)) { room = true; continue; @@ -189,6 +190,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + if (namelen > F2FS_NAME_LEN) + return NULL; + if (npages == 0) return NULL; @@ -212,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page = NULL; - struct f2fs_dir_entry *de = NULL; - struct f2fs_dentry_block *dentry_blk = NULL; + struct page *page; + struct f2fs_dir_entry *de; + struct f2fs_dentry_block *dentry_blk; page = get_lock_data_page(dir, 0); if (IS_ERR(page)) @@ -246,9 +250,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - - mutex_lock_op(sbi, DENTRY_OPS); lock_page(page); wait_on_page_writeback(page); de->ino = cpu_to_le32(inode->i_ino); @@ -262,18 +263,12 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, F2FS_I(inode)->i_pino = dir->i_ino; f2fs_put_page(page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } -void init_dent_inode(const struct qstr *name, struct page *ipage) +static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_node *rn; - if (IS_ERR(ipage)) - return; - - wait_on_page_writeback(ipage); - /* copy name info. to this inode page */ rn = (struct f2fs_node *)page_address(ipage); rn->i.i_namelen = cpu_to_le32(name->len); @@ -281,64 +276,115 @@ void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -static int init_inode_metadata(struct inode *inode, +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + void *kaddr; + + dentry_page = get_new_data_page(inode, page, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + kaddr = kmap_atomic(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)kaddr; + + de = &dentry_blk->dentry[0]; + de->name_len = cpu_to_le16(1); + de->hash_code = 0; + de->ino = cpu_to_le32(inode->i_ino); + memcpy(dentry_blk->filename[0], ".", 1); + set_de_type(de, inode); + + de = &dentry_blk->dentry[1]; + de->hash_code = 0; + de->name_len = cpu_to_le16(2); + de->ino = cpu_to_le32(parent->i_ino); + memcpy(dentry_blk->filename[1], "..", 2); + set_de_type(de, inode); + + test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); + test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); + kunmap_atomic(kaddr); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + +static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { + struct page *page; + int err; + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - int err; - err = new_inode_page(inode, name); - if (err) - return err; + page = new_inode_page(inode, name); + if (IS_ERR(page)) + return page; if (S_ISDIR(inode->i_mode)) { - err = f2fs_make_empty(inode, dir); - if (err) { - remove_inode_page(inode); - return err; - } + err = make_empty_dir(inode, dir, page); + if (err) + goto error; } err = f2fs_init_acl(inode, dir); - if (err) { - remove_inode_page(inode); - return err; - } + if (err) + goto error; + + err = f2fs_init_security(inode, dir, name, page); + if (err) + goto error; + + wait_on_page_writeback(page); } else { - struct page *ipage; - ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - set_cold_node(inode, ipage); - init_dent_inode(name, ipage); - f2fs_put_page(ipage, 1); + page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); + if (IS_ERR(page)) + return page; + + wait_on_page_writeback(page); + set_cold_node(inode, page); } + + init_dent_inode(name, page); + + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + file_lost_pino(inode); inc_nlink(inode); - f2fs_write_inode(inode, NULL); } - return 0; + return page; + +error: + f2fs_put_page(page, 1); + remove_inode_page(inode); + return ERR_PTR(err); } static void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - bool need_dir_update = false; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) { inc_nlink(dir); - need_dir_update = true; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; - need_dir_update = true; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (need_dir_update) - f2fs_write_inode(dir, NULL); + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) + update_inode_page(dir); else mark_inode_dirty(dir); @@ -370,6 +416,10 @@ next: goto next; } +/* + * Caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). + */ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { unsigned int bit_pos; @@ -379,11 +429,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); + struct page *page; int err = 0; int i; @@ -409,12 +459,9 @@ start: bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - mutex_lock_op(sbi, DENTRY_OPS); - dentry_page = get_new_data_page(dir, block, true); - if (IS_ERR(dentry_page)) { - mutex_unlock_op(sbi, DENTRY_OPS); + dentry_page = get_new_data_page(dir, NULL, block, true); + if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - } dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(dentry_blk, slots); @@ -423,19 +470,19 @@ start: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: - err = init_inode_metadata(inode, dir, name); - if (err) - goto fail; - wait_on_page_writeback(dentry_page); + page = init_inode_metadata(inode, dir, name); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } de = &dentry_blk->dentry[bit_pos]; de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); @@ -446,14 +493,16 @@ add_dentry: test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); set_page_dirty(dentry_page); - update_parent_metadata(dir, inode, current_depth); - - /* update parent inode number before releasing dentry page */ + /* we don't need to mark_inode_dirty now */ F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + + update_parent_metadata(dir, inode, current_depth); fail: + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); return err; } @@ -473,8 +522,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, void *kaddr = page_address(page); int i; - mutex_lock_op(sbi, DENTRY_OPS); - lock_page(page); wait_on_page_writeback(page); @@ -494,7 +541,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode && S_ISDIR(inode->i_mode)) { drop_nlink(dir); - f2fs_write_inode(dir, NULL); + update_inode_page(dir); } else { mark_inode_dirty(dir); } @@ -506,7 +553,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } - f2fs_write_inode(inode, NULL); + update_inode_page(inode); + if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); } @@ -519,45 +567,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); - - mutex_unlock_op(sbi, DENTRY_OPS); -} - -int f2fs_make_empty(struct inode *inode, struct inode *parent) -{ - struct page *dentry_page; - struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; - void *kaddr; - - dentry_page = get_new_data_page(inode, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); - - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - - de = &dentry_blk->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = f2fs_dentry_hash(".", 1); - de->ino = cpu_to_le32(inode->i_ino); - memcpy(dentry_blk->filename[0], ".", 1); - set_de_type(de, inode); - - de = &dentry_blk->dentry[1]; - de->hash_code = f2fs_dentry_hash("..", 2); - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(dentry_blk->filename[1], "..", 2); - set_de_type(de, inode); - - test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); - test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); - - set_page_dirty(dentry_page); - f2fs_put_page(dentry_page, 1); - return 0; } bool f2fs_empty_dir(struct inode *dir) @@ -597,34 +606,26 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int f2fs_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = file->f_pos; struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); - unsigned char *types = NULL; - unsigned int bit_pos = 0, start_bit_pos = 0; - int over = 0; + unsigned int bit_pos = 0; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dir_entry *de = NULL; struct page *dentry_page = NULL; - unsigned int n = 0; + unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); unsigned char d_type = DT_UNKNOWN; - int slots; - types = f2fs_filetype_table; - bit_pos = (pos % NR_DENTRY_IN_BLOCK); - n = (pos / NR_DENTRY_IN_BLOCK); + bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); for ( ; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; - start_bit_pos = bit_pos; dentry_blk = kmap(dentry_page); while (bit_pos < NR_DENTRY_IN_BLOCK) { - d_type = DT_UNKNOWN; bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); @@ -632,28 +633,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) break; de = &dentry_blk->dentry[bit_pos]; - if (types && de->file_type < F2FS_FT_MAX) - d_type = types[de->file_type]; - - over = filldir(dirent, + if (de->file_type < F2FS_FT_MAX) + d_type = f2fs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + if (!dir_emit(ctx, dentry_blk->filename[bit_pos], le16_to_cpu(de->name_len), - (n * NR_DENTRY_IN_BLOCK) + bit_pos, - le32_to_cpu(de->ino), d_type); - if (over) { - file->f_pos += bit_pos - start_bit_pos; - goto success; - } - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - bit_pos += slots; + le32_to_cpu(de->ino), d_type)) + goto stop; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos; } bit_pos = 0; - file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; + ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); dentry_page = NULL; } -success: +stop: if (dentry_page && !IS_ERR(dentry_page)) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); @@ -665,7 +664,7 @@ success: const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = f2fs_readdir, + .iterate = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, }; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc2213afdcc7..467d42d65c48 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -37,21 +37,35 @@ typecheck(unsigned long long, b) && \ ((long long)((a) - (b)) > 0)) -typedef u64 block_t; +typedef u32 block_t; /* + * should not change u32, since it is the on-disk block + * address format, __le32. + */ typedef u32 nid_t; struct f2fs_mount_info { unsigned int opt; }; -static inline __u32 f2fs_crc32(void *buff, size_t len) +#define CRCPOLY_LE 0xedb88320 + +static inline __u32 f2fs_crc32(void *buf, size_t len) { - return crc32_le(F2FS_SUPER_MAGIC, buff, len); + unsigned char *p = (unsigned char *)buf; + __u32 crc = F2FS_SUPER_MAGIC; + int i; + + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); + } + return crc; } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) +static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) { - return f2fs_crc32(buff, buff_size) == blk_crc; + return f2fs_crc32(buf, buf_size) == blk_crc; } /* @@ -125,11 +139,15 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) * file keeping -1 as its node offset to * distinguish from index node blocks. */ -#define RDONLY_NODE 1 /* - * specify a read-only mode when getting - * a node block. 0 is read-write mode. - * used by get_dnode_of_data(). +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_datablock_ro. */ +}; + #define F2FS_LINK_MAX 32000 /* maximum link count per file */ /* for in-memory extent cache entry */ @@ -137,13 +155,14 @@ struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ u32 blk_addr; /* start block address of the extent */ - unsigned int len; /* lenth of the extent */ + unsigned int len; /* length of the extent */ }; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 +#define FADVISE_LOST_PINO_BIT 0x02 struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ @@ -155,7 +174,6 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ - unsigned long long data_version;/* latest version of data for fsync */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -186,7 +204,6 @@ static inline void set_raw_extent(struct extent_info *ext, struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t init_scan_nid; /* the first nid to be scanned */ nid_t next_scan_nid; /* the next nid to be scanned */ /* NAT cache management */ @@ -305,23 +322,12 @@ enum count_type { }; /* - * FS_LOCK nesting subclasses for the lock validator: - * - * The locking order between these classes is - * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW - * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC + * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS]. + * The checkpoint procedure blocks all the locks in this fs_lock array. + * Some FS operations grab free locks, and if there is no free lock, + * then wait to grab a lock in a round-robin manner. */ -enum lock_type { - RENAME, /* for renaming operations */ - DENTRY_OPS, /* for directory operations */ - DATA_WRITE, /* for data write */ - DATA_NEW, /* for data allocation */ - DATA_TRUNC, /* for data truncate */ - NODE_NEW, /* for node allocation */ - NODE_TRUNC, /* for node truncate */ - NODE_WRITE, /* for node write */ - NR_LOCK_TYPE, -}; +#define NR_GLOBAL_LOCKS 8 /* * The below are the page types of bios used in submti_bio(). @@ -361,11 +367,13 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* for checkpoint procedure */ - struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */ - struct mutex write_inode; /* mutex for write inode */ + struct mutex cp_mutex; /* checkpoint procedure lock */ + struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */ + struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ + unsigned char next_lock_num; /* round-robin global locks */ int por_doing; /* recovery is doing or not */ + int on_build_free_nids; /* build_free_nids is doing */ /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ @@ -375,7 +383,6 @@ struct f2fs_sb_info { /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ - unsigned int n_dirty_dirs; /* # of dir inodes */ /* basic file system units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -406,17 +413,21 @@ struct f2fs_sb_info { /* for cleaning operations */ struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + unsigned int cur_victim_sec; /* current victim section num */ /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. */ +#ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ - unsigned int last_victim[2]; /* last victim segment # */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ int bg_gc; /* background gc calls */ + unsigned int n_dirty_dirs; /* # of dir inodes */ +#endif + unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ }; @@ -498,22 +509,59 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void mutex_lock_all(struct f2fs_sb_info *sbi) { - mutex_lock_nested(&sbi->fs_lock[t], t); + int i; + + for (i = 0; i < NR_GLOBAL_LOCKS; i++) { + /* + * This is the only time we take multiple fs_lock[] + * instances; the order is immaterial since we + * always hold cp_mutex, which serializes multiple + * such operations. + */ + mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex); + } } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->fs_lock[t]); + int i = 0; + for (; i < NR_GLOBAL_LOCKS; i++) + mutex_unlock(&sbi->fs_lock[i]); +} + +static inline int mutex_lock_op(struct f2fs_sb_info *sbi) +{ + unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS; + int i = 0; + + for (; i < NR_GLOBAL_LOCKS; i++) + if (mutex_trylock(&sbi->fs_lock[i])) + return i; + + mutex_lock(&sbi->fs_lock[next_lock]); + sbi->next_lock_num++; + return next_lock; +} + +static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) +{ + if (ilock < 0) + return; + BUG_ON(ilock >= NR_GLOBAL_LOCKS); + mutex_unlock(&sbi->fs_lock[ilock]); } /* * Check whether the given nid is within node id range. */ -static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - BUG_ON((nid >= NM_I(sbi)->max_nid)); + WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (nid >= NM_I(sbi)->max_nid) + return -EINVAL; + return 0; } #define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 @@ -819,10 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ - FI_NEED_CP, /* need to do checkpoint during fsync */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ + FI_UPDATE_DIR, /* should update inode block for consistency */ + FI_DELAY_IPUT, /* used for the recovery */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -855,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) return 0; } +static inline int f2fs_readonly(struct super_block *sb) +{ + return sb->s_flags & MS_RDONLY; +} + /* * file.c */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); void f2fs_truncate(struct inode *); +int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); +int truncate_data_blocks_range(struct dnode_of_data *, int); long f2fs_ioctl(struct file *, unsigned int, unsigned long); long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -872,6 +929,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); void update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -889,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -void init_dent_inode(const struct qstr *, struct page *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); @@ -924,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int remove_inode_page(struct inode *); -int new_inode_page(struct inode *, const struct qstr *); -struct page *new_node_page(struct dnode_of_data *, unsigned int); +struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); @@ -950,7 +1007,6 @@ void destroy_node_manager_caches(void); */ void f2fs_balance_fs(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); -void locate_dirty_segment(struct f2fs_sb_info *, unsigned int); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); @@ -973,7 +1029,6 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); -void reset_victim_segmap(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); /* @@ -988,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); +void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); +struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); void init_orphan_info(struct f2fs_sb_info *); @@ -1000,9 +1057,9 @@ void destroy_checkpoint_caches(void); */ int reserve_new_block(struct dnode_of_data *); void update_extent_cache(block_t, struct dnode_of_data *); -struct page *find_data_page(struct inode *, pgoff_t); +struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); -struct page *get_new_data_page(struct inode *, pgoff_t, bool); +struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); int do_write_data_page(struct page *); @@ -1020,7 +1077,7 @@ void destroy_gc_caches(void); /* * recovery.c */ -void recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *); bool space_for_roll_forward(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 958a46da19ae..d2d2b7dbdcc1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/blkdev.h> #include <linux/falloc.h> #include <linux/types.h> #include <linux/compat.h> @@ -24,6 +25,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -33,19 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr; struct dnode_of_data dn; - int err; + int err, ilock; f2fs_balance_fs(sbi); sb_start_pagefault(inode->i_sb); - mutex_lock_op(sbi, DATA_NEW); - /* block allocation */ + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, 0); + err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); goto out; } @@ -55,17 +56,17 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, err = reserve_new_block(&dn); if (err) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); goto out; } } f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, ilock); - mutex_unlock_op(sbi, DATA_NEW); - + file_update_time(vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping || - page_offset(page) >= i_size_read(inode) || + page_offset(page) > i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); err = -EFAULT; @@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) - goto out; - - /* fill the page */ - wait_on_page_writeback(page); + goto mapped; /* page is wholly or partially inside EOF */ if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { @@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, set_page_dirty(page); SetPageUptodate(page); - file_update_time(vma->vm_file); +mapped: + /* fill the page */ + wait_on_page_writeback(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -102,28 +102,28 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .remap_pages = generic_file_remap_pages, }; -static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) +static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - nid_t pino; inode = igrab(inode); dentry = d_find_any_alias(inode); - if (!dentry) { - iput(inode); + iput(inode); + if (!dentry) return 0; - } - pino = dentry->d_parent->d_inode->i_ino; + + inode = igrab(dentry->d_parent->d_inode); dput(dentry); + + *pino = inode->i_ino; iput(inode); - return !is_checkpointed_node(sbi, pino); + return 1; } int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - unsigned long long cur_version; int ret = 0; bool need_cp = false; struct writeback_control wbc = { @@ -132,12 +132,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) .for_reclaim = 0, }; - if (inode->i_sb->s_flags & MS_RDONLY) + if (f2fs_readonly(inode->i_sb)) return 0; + trace_f2fs_sync_file_enter(inode); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) + if (ret) { + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; + } /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -147,40 +150,44 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; - mutex_lock(&sbi->cp_mutex); - cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); - mutex_unlock(&sbi->cp_mutex); - - if (F2FS_I(inode)->data_version != cur_version && - !(inode->i_state & I_DIRTY)) - goto out; - F2FS_I(inode)->data_version--; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + else if (file_wrong_pino(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) need_cp = true; - else if (need_to_sync_dir(sbi, inode)) + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; if (need_cp) { + nid_t pino; + /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); - clear_inode_flag(F2FS_I(inode), FI_NEED_CP); + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + F2FS_I(inode)->i_pino = pino; + file_got_pino(inode); + mark_inode_dirty_sync(inode); + ret = f2fs_write_inode(inode, NULL); + if (ret) + goto out; + } } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; } filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } out: mutex_unlock(&inode->i_mutex); + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } @@ -191,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); @@ -208,14 +215,17 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) update_extent_cache(NULL_ADDR, dn); invalidate_blocks(sbi, blkaddr); - dec_valid_block_count(sbi, dn->inode, 1); nr_free++; } if (nr_free) { + dec_valid_block_count(sbi, dn->inode, nr_free); set_page_dirty(dn->node_page); sync_inode_page(dn); } dn->ofs_in_node = ofs; + + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); return nr_free; } @@ -232,11 +242,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) if (!offset) return; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT); + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); if (IS_ERR(page)) return; lock_page(page); + if (page->mapping != inode->i_mapping) { + f2fs_put_page(page, 1); + return; + } wait_on_page_writeback(page); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); @@ -249,20 +263,22 @@ static int truncate_blocks(struct inode *inode, u64 from) unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; + int count = 0, ilock = -1; int err; + trace_f2fs_truncate_blocks_enter(inode, from); + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - mutex_lock_op(sbi, DATA_TRUNC); - + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, free_from, RDONLY_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, DATA_TRUNC); + mutex_unlock_op(sbi, ilock); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -273,6 +289,7 @@ static int truncate_blocks(struct inode *inode, u64 from) count -= dn.ofs_in_node; BUG_ON(count < 0); + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); free_from += count; @@ -281,11 +298,12 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - mutex_unlock_op(sbi, DATA_TRUNC); + mutex_unlock_op(sbi, ilock); /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -295,13 +313,15 @@ void f2fs_truncate(struct inode *inode) S_ISLNK(inode->i_mode))) return; + trace_f2fs_truncate(inode); + if (!truncate_blocks(inode, i_size_read(inode))) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } } -static int f2fs_getattr(struct vfsmount *mnt, +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -389,15 +409,16 @@ static void fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; + int ilock; if (!len) return; f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_NEW); - page = get_new_data_page(inode, index, false); - mutex_unlock_op(sbi, DATA_NEW); + ilock = mutex_lock_op(sbi); + page = get_new_data_page(inode, NULL, index, false); + mutex_unlock_op(sbi, ilock); if (!IS_ERR(page)) { wait_on_page_writeback(page); @@ -414,15 +435,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) for (index = pg_start; index < pg_end; index++) { struct dnode_of_data dn; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - - f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { - mutex_unlock_op(sbi, DATA_TRUNC); if (err == -ENOENT) continue; return err; @@ -431,7 +447,6 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) if (dn.data_blkaddr != NULL_ADDR) truncate_data_blocks_range(&dn, 1); f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_TRUNC); } return 0; } @@ -461,12 +476,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ilock; + + f2fs_balance_fs(sbi); blk_start = pg_start << PAGE_CACHE_SHIFT; blk_end = pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); + + ilock = mutex_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); + mutex_unlock_op(sbi, ilock); } } @@ -500,13 +522,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; + int ilock; - mutex_lock_op(sbi, DATA_NEW); - + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); break; } @@ -514,13 +536,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, ret = reserve_new_block(&dn); if (ret) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); break; } } f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if (pg_start == pg_end) new_size = offset + len; @@ -559,6 +580,7 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } @@ -583,14 +605,14 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int ret; switch (cmd) { - case FS_IOC_GETFLAGS: + case F2FS_IOC_GETFLAGS: flags = fi->i_flags & FS_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); - case FS_IOC_SETFLAGS: + case F2FS_IOC_SETFLAGS: { unsigned int oldflags; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -627,7 +649,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } default: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 94b8a0c48453..35f9b1a196aa 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/module.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> @@ -23,6 +22,7 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include <trace/events/f2fs.h> static struct kmem_cache *winode_slab; @@ -76,14 +76,13 @@ static int gc_thread_func(void *data) else wait_ms = increase_sleep_time(wait_ms); +#ifdef CONFIG_F2FS_STAT_FS sbi->bg_gc++; +#endif /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; - else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) - wait_ms = GC_THREAD_MAX_SLEEP_TIME; - } while (!kthread_should_stop()); return 0; } @@ -92,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; if (!test_opt(sbi, BG_GC)) - return 0; + goto out; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) - return -ENOMEM; + if (!gc_th) { + err = -ENOMEM; + goto out; + } sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); kfree(gc_th); sbi->gc_thread = NULL; - return -ENOMEM; } - return 0; + +out: + return err; } void stop_gc_thread(struct f2fs_sb_info *sbi) @@ -131,7 +135,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode) { + if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_segmap = dirty_i->dirty_segmap[type]; p->ofs_unit = 1; @@ -160,18 +164,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno; + unsigned int hint = 0; + unsigned int secno; /* * If the gc_type is FG_GC, we can select victim segments * selected by background GC before. * Those segments guarantee they have small valid blocks. */ - segno = find_next_bit(dirty_i->victim_segmap[BG_GC], - TOTAL_SEGS(sbi), 0); - if (segno < TOTAL_SEGS(sbi)) { - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); - return segno; +next: + secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); + if (secno < TOTAL_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + goto next; + clear_bit(secno, dirty_i->victim_secmap); + return secno * sbi->segs_per_sec; } return NULL_SEGNO; } @@ -222,7 +229,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, } /* - * This function is called from two pathes. + * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. * When it is called during GC, it just gets a victim segment * and it does not remove it from dirty seglist. @@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int segno; + unsigned int secno, max_cost; int nsearched = 0; p.alloc_mode = alloc_mode; select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = get_max_cost(sbi, &p); + p.min_cost = max_cost = get_max_cost(sbi, &p); mutex_lock(&dirty_i->seglist_lock); @@ -253,6 +260,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, while (1) { unsigned long cost; + unsigned int segno; segno = find_next_bit(p.dirty_segmap, TOTAL_SEGS(sbi), p.offset); @@ -265,13 +273,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, break; } p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + secno = GET_SECNO(sbi, segno); - if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) - continue; - if (gc_type == BG_GC && - test_bit(segno, dirty_i->victim_segmap[BG_GC])) + if (sec_usage_check(sbi, secno)) continue; - if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) continue; cost = get_gc_cost(sbi, segno, &p); @@ -281,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_cost = cost; } - if (cost == get_max_cost(sbi, &p)) + if (cost == max_cost) continue; if (nsearched++ >= MAX_VICTIM_SEARCH) { @@ -289,15 +295,20 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, break; } } -got_it: if (p.min_segno != NULL_SEGNO) { - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +got_it: if (p.alloc_mode == LFS) { - int i; - for (i = 0; i < p.ofs_unit; i++) - set_bit(*result + i, - dirty_i->victim_segmap[gc_type]); + secno = GET_SECNO(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); } mutex_unlock(&dirty_i->seglist_lock); @@ -310,28 +321,21 @@ static const struct victim_selection default_v_ops = { static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) { - struct list_head *this; struct inode_entry *ie; - list_for_each(this, ilist) { - ie = list_entry(this, struct inode_entry, list); + list_for_each_entry(ie, ilist, list) if (ie->inode->i_ino == ino) return ie->inode; - } return NULL; } static void add_gc_inode(struct inode *inode, struct list_head *ilist) { - struct list_head *this; - struct inode_entry *new_ie, *ie; + struct inode_entry *new_ie; - list_for_each(this, ilist) { - ie = list_entry(this, struct inode_entry, list); - if (ie->inode == inode) { - iput(inode); - return; - } + if (inode == find_gc_inode(inode->i_ino, ilist)) { + iput(inode); + return; } repeat: new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); @@ -381,6 +385,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -401,11 +406,18 @@ next_step: continue; /* set page dirty and write it */ - if (!PageWriteback(node_page)) + if (gc_type == FG_GC) { + f2fs_submit_bio(sbi, NODE, true); + wait_on_page_writeback(node_page); set_page_dirty(node_page); + } else { + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } f2fs_put_page(node_page, 1); stat_inc_node_blk_count(sbi, 1); } + if (initial) { initial = false; goto next_step; @@ -418,6 +430,13 @@ next_step: .for_reclaim = 0, }; sync_node_pages(sbi, 0, &wbc); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) + goto next_step; } } @@ -481,21 +500,19 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { - if (page->mapping != inode->i_mapping) - goto out; - - if (inode != page->mapping->host) - goto out; - - if (PageWriteback(page)) - goto out; - if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; set_page_dirty(page); set_cold_data(page); } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - mutex_lock_op(sbi, DATA_WRITE); + + if (PageWriteback(page)) { + f2fs_submit_bio(sbi, DATA, true); + wait_on_page_writeback(page); + } + if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); @@ -503,7 +520,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } set_cold_data(page); do_write_data_page(page); - mutex_unlock_op(sbi, DATA_WRITE); clear_cold_data(page); } out: @@ -530,6 +546,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { struct page *data_page; struct inode *inode; @@ -567,7 +584,7 @@ next_step: continue; data_page = find_data_page(inode, - start_bidx + ofs_in_node); + start_bidx + ofs_in_node, false); if (IS_ERR(data_page)) goto next_iput; @@ -588,11 +605,22 @@ next_step: next_iput: iput(inode); } + if (++phase < 4) goto next_step; - if (gc_type == FG_GC) + if (gc_type == FG_GC) { f2fs_submit_bio(sbi, DATA, true); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) { + phase = 2; + goto next_step; + } + } } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -611,18 +639,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, { struct page *sum_page; struct f2fs_summary_block *sum; + struct blk_plug plug; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); if (IS_ERR(sum_page)) return; - /* - * CP needs to lock sum_page. In this time, we don't need - * to lock this page, because this summary page is not gone anywhere. - * Also, this page is not gonna be updated before GC is done. - */ - unlock_page(sum_page); + blk_start_plug(&plug); + sum = page_address(sum_page); switch (GET_SUM_TYPE((&sum->footer))) { @@ -633,10 +658,12 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); break; } + blk_finish_plug(&plug); + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 1); } int f2fs_gc(struct f2fs_sb_info *sbi) @@ -652,8 +679,10 @@ gc_more: if (!(sbi->sb->s_flags & MS_ACTIVE)) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; + write_checkpoint(sbi, false); + } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) goto stop; @@ -662,9 +691,11 @@ gc_more: for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); - if (gc_type == FG_GC && - get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + if (gc_type == FG_GC) { + sbi->cur_victim_sec = NULL_SEGNO; nfree++; + WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + } if (has_not_enough_free_secs(sbi, nfree)) goto gc_more; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 30b2db003acd..2c6a6bd08322 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,9 +13,9 @@ * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 30000 -#define GC_THREAD_NOGC_SLEEP_TIME 10000 +#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define GC_THREAD_MAX_SLEEP_TIME 60000 +#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -58,6 +58,9 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) static inline long increase_sleep_time(long wait) { + if (wait == GC_THREAD_NOGC_SLEEP_TIME) + return wait; + wait += GC_THREAD_MIN_SLEEP_TIME; if (wait > GC_THREAD_MAX_SLEEP_TIME) wait = GC_THREAD_MAX_SLEEP_TIME; @@ -66,6 +69,9 @@ static inline long increase_sleep_time(long wait) static inline long decrease_sleep_time(long wait) { + if (wait == GC_THREAD_NOGC_SLEEP_TIME) + wait = GC_THREAD_MAX_SLEEP_TIME; + wait -= GC_THREAD_MIN_SLEEP_TIME; if (wait <= GC_THREAD_MIN_SLEEP_TIME) wait = GC_THREAD_MIN_SLEEP_TIME; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ddae412d30c8..2b2d45d19e3e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,8 @@ #include "f2fs.h" #include "node.h" +#include <trace/events/f2fs.h> + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -44,7 +46,11 @@ static int do_read_inode(struct inode *inode) struct f2fs_inode *ri; /* Check if ino is within scope */ - check_nid_range(sbi, inode->i_ino); + if (check_nid_range(sbi, inode->i_ino)) { + f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", + (unsigned long) inode->i_ino); + return -EINVAL; + } node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) @@ -76,7 +82,6 @@ static int do_read_inode(struct inode *inode) fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; - fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); @@ -88,25 +93,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - int ret; + int ret = 0; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + trace_f2fs_iget(inode); return inode; + } if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; ret = do_read_inode(inode); if (ret) goto bad_inode; - - if (!sbi->por_doing && inode->i_nlink == 0) { - ret = -ENOENT; - goto bad_inode; - } - make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; @@ -122,8 +124,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | - __GFP_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -136,11 +137,12 @@ make_now: goto bad_inode; } unlock_new_inode(inode); - + trace_f2fs_iget(inode); return inode; bad_inode: iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); return ERR_PTR(ret); } @@ -190,49 +192,57 @@ void update_inode(struct inode *inode, struct page *node_page) set_cold_node(inode, node_page); set_page_dirty(node_page); + clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - bool need_lock = false; - - if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) - return 0; - - if (wbc) - f2fs_balance_fs(sbi); node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); - if (!PageDirty(node_page)) { - need_lock = true; - f2fs_put_page(node_page, 1); - mutex_lock(&sbi->write_inode); - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) { - mutex_unlock(&sbi->write_inode); - return PTR_ERR(node_page); - } - } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - if (need_lock) - mutex_unlock(&sbi->write_inode); return 0; } +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ret, ilock; + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + return 0; + + if (wbc) + f2fs_balance_fs(sbi); + + /* + * We need to lock here to prevent from producing dirty node pages + * during the urgent cleaning time when runing out of free sections. + */ + ilock = mutex_lock_op(sbi); + ret = update_inode_page(inode); + mutex_unlock_op(sbi, ilock); + return ret; +} + /* * Called at the last iput() if i_nlink is zero */ void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ilock; + trace_f2fs_evict_inode(inode); truncate_inode_pages(&inode->i_data, 0); if (inode->i_ino == F2FS_NODE_INO(sbi) || @@ -252,7 +262,10 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); + ilock = mutex_lock_op(sbi); remove_inode_page(inode); + mutex_unlock_op(sbi, ilock); + sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1a49b881bac0..64c07169df05 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,8 +15,10 @@ #include <linux/ctype.h> #include "f2fs.h" +#include "node.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { @@ -25,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; - int err; + int err, ilock; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); - mutex_lock_op(sbi, NODE_NEW); + ilock = mutex_lock_op(sbi); if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, NODE_NEW); + mutex_unlock_op(sbi, ilock); err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, NODE_NEW); + mutex_unlock_op(sbi, ilock); inode->i_uid = current_fsuid(); @@ -61,7 +63,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; goto out; } - + trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -69,6 +71,8 @@ out: clear_nlink(inode); unlock_new_inode(inode); fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); iput(inode); if (nid_free) alloc_nid_failed(sbi, ino); @@ -82,7 +86,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) int ret; if (sublen > slen) - return 1; + return 0; ret = memcmp(s + slen - sublen, sub, sublen); if (ret) { /* compare upper case */ @@ -90,16 +94,16 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) char upper_sub[8]; for (i = 0; i < sublen && i < sizeof(upper_sub); i++) upper_sub[i] = toupper(sub[i]); - return memcmp(s + slen - sublen, upper_sub, sublen); + return !memcmp(s + slen - sublen, upper_sub, sublen); } - return ret; + return !ret; } /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { int i; @@ -107,8 +111,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { - if (!is_multimedia_file(name, extlist[i])) { - F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; + if (is_multimedia_file(name, extlist[i])) { + file_set_cold(inode); break; } } @@ -121,7 +125,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; nid_t ino = 0; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -130,26 +134,28 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_file(sbi, inode, dentry->d_name.name); + set_cold_files(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; alloc_nid_done(sbi, ino); - if (!sbi->por_doing) - d_instantiate(dentry, inode); + d_instantiate(dentry, inode); unlock_new_inode(inode); return 0; out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, ino); return err; @@ -161,15 +167,17 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int err; + int err, ilock; f2fs_balance_fs(sbi); inode->i_ctime = CURRENT_TIME; - atomic_inc(&inode->i_count); + ihold(inode); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -197,7 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_dir_entry *de; struct page *page; - if (dentry->d_name.len > F2FS_MAX_NAME_LEN) + if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -222,7 +230,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; + int ilock; + trace_f2fs_unlink_enter(dir, dentry); f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -236,11 +246,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } + ilock = mutex_lock_op(sbi); f2fs_delete_entry(de, page, inode); + mutex_unlock_op(sbi, ilock); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); fail: + trace_f2fs_unlink_exit(inode, err); return err; } @@ -251,7 +264,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; size_t symlen = strlen(symname) + 1; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -262,7 +275,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -275,6 +290,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -284,7 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -298,7 +314,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out_fail; @@ -313,6 +331,7 @@ out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -333,6 +352,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; int err = 0; + int ilock; if (!new_valid_dev(rdev)) return -EINVAL; @@ -346,7 +366,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -357,6 +379,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -374,7 +397,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - int err = -ENOENT; + int err = -ENOENT, ilock = -1; f2fs_balance_fs(sbi); @@ -389,7 +412,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - mutex_lock_op(sbi, RENAME); + ilock = mutex_lock_op(sbi); if (new_inode) { struct page *new_page; @@ -412,7 +435,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); - f2fs_write_inode(new_inode, NULL); + update_inode_page(new_inode); } else { err = f2fs_add_link(new_dentry, old_inode); if (err) @@ -420,12 +443,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir_entry) { inc_nlink(new_dir); - f2fs_write_inode(new_dir, NULL); + update_inode_page(new_dir); } } old_inode->i_ctime = CURRENT_TIME; - set_inode_flag(F2FS_I(old_inode), FI_NEED_CP); mark_inode_dirty(old_inode); f2fs_delete_entry(old_entry, old_page, NULL); @@ -439,10 +461,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); - f2fs_write_inode(old_dir, NULL); + update_inode_page(old_dir); } - mutex_unlock_op(sbi, RENAME); + mutex_unlock_op(sbi, ilock); return 0; out_dir: @@ -450,7 +472,7 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - mutex_unlock_op(sbi, RENAME); + mutex_unlock_op(sbi, ilock); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); @@ -468,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, #ifdef CONFIG_F2FS_FS_XATTR @@ -482,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, @@ -492,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { }; const struct inode_operations f2fs_special_inode_operations = { + .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, #ifdef CONFIG_F2FS_FS_XATTR diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e275218904ed..b418aee09573 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -88,10 +89,13 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { struct address_space *mapping = sbi->meta_inode->i_mapping; struct f2fs_nm_info *nm_i = NM_I(sbi); + struct blk_plug plug; struct page *page; pgoff_t index; int i; + blk_start_plug(&plug); + for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { if (nid >= nm_i->max_nid) nid = 0; @@ -100,12 +104,16 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) page = grab_cache_page(mapping, index); if (!page) continue; - if (f2fs_readpage(sbi, page, index, READ)) { + if (PageUptodate(page)) { f2fs_put_page(page, 1); continue; } + if (f2fs_readpage(sbi, page, index, READ)) + continue; + f2fs_put_page(page, 0); } + blk_finish_plug(&plug); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -236,7 +244,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD) + if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) return 0; write_lock(&nm_i->nat_tree_lock); @@ -320,15 +328,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[0] = 0; if (block < direct_index) { - offset[n++] = block; - level = 0; + offset[n] = block; goto got; } block -= direct_index; if (block < direct_blks) { offset[n++] = NODE_DIR1_BLOCK; noffset[n] = 1; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -336,7 +343,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) if (block < direct_blks) { offset[n++] = NODE_DIR2_BLOCK; noffset[n] = 2; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -346,7 +353,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 3; offset[n++] = block / direct_blks; noffset[n] = 4 + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -356,7 +363,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 4 + dptrs_per_blk; offset[n++] = block / direct_blks; noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -371,7 +378,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 7 + (dptrs_per_blk * 2) + offset[n - 2] * (dptrs_per_blk + 1) + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 3; goto got; } else { @@ -383,8 +390,11 @@ got: /* * Caller should call f2fs_put_dnode(dn). + * Also, it should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * In the case of RDONLY_NODE, we don't need to care about mutex. */ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *npage[4]; @@ -398,12 +408,16 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) level = get_node_path(index, offset, noffset); nids[0] = dn->inode->i_ino; - npage[0] = get_node_page(sbi, nids[0]); - if (IS_ERR(npage[0])) - return PTR_ERR(npage[0]); + npage[0] = dn->inode_page; + if (!npage[0]) { + npage[0] = get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + } parent = npage[0]; - nids[1] = get_nid(parent, offset[0], true); + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); dn->inode_page = npage[0]; dn->inode_page_locked = true; @@ -411,30 +425,25 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) for (i = 1; i <= level; i++) { bool done = false; - if (!nids[i] && !ro) { - mutex_lock_op(sbi, NODE_NEW); - + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!alloc_nid(sbi, &(nids[i]))) { - mutex_unlock_op(sbi, NODE_NEW); err = -ENOSPC; goto release_pages; } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i]); + npage[i] = new_node_page(dn, noffset[i], NULL); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); alloc_nid_done(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); done = true; - } else if (ro && i == level && level > 1) { + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { npage[i] = get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); @@ -507,6 +516,7 @@ invalidate: f2fs_put_page(dn->node_page, 1); dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } static int truncate_dnode(struct dnode_of_data *dn) @@ -547,9 +557,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (dn->nid == 0) return NIDS_PER_BLOCK + 1; + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + page = get_node_page(sbi, dn->nid); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); + } rn = (struct f2fs_node *)page_address(page); if (depth < 3) { @@ -591,10 +605,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, } else { f2fs_put_page(page, 1); } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); return freed; out_err: f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); return ret; } @@ -649,6 +665,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, fail: for (i = depth - 3; i >= 0; i--) f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + return err; } @@ -658,6 +677,7 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; @@ -665,11 +685,15 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) struct dnode_of_data dn; struct page *page; - level = get_node_path(from, offset, noffset); + trace_f2fs_truncate_inode_blocks_enter(inode, from); + level = get_node_path(from, offset, noffset); +restart: page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); + } set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); @@ -728,6 +752,10 @@ skip_partial: if (offset[1] == 0 && rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); + if (page->mapping != node_mapping) { + f2fs_put_page(page, 1); + goto restart; + } wait_on_page_writeback(page); rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); @@ -739,9 +767,14 @@ skip_partial: } fail: f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 0 : err; } +/* + * Caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). + */ int remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -749,21 +782,16 @@ int remove_inode_page(struct inode *inode) nid_t ino = inode->i_ino; struct dnode_of_data dn; - mutex_lock_op(sbi, NODE_TRUNC); page = get_node_page(sbi, ino); - if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_TRUNC); + if (IS_ERR(page)) return PTR_ERR(page); - } if (F2FS_I(inode)->i_xattr_nid) { nid_t nid = F2FS_I(inode)->i_xattr_nid; struct page *npage = get_node_page(sbi, nid); - if (IS_ERR(npage)) { - mutex_unlock_op(sbi, NODE_TRUNC); + if (IS_ERR(npage)) return PTR_ERR(npage); - } F2FS_I(inode)->i_xattr_nid = 0; set_new_dnode(&dn, inode, page, npage, nid); @@ -775,30 +803,22 @@ int remove_inode_page(struct inode *inode) BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - - mutex_unlock_op(sbi, NODE_TRUNC); return 0; } -int new_inode_page(struct inode *inode, const struct qstr *name) +struct page *new_inode_page(struct inode *inode, const struct qstr *name) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - mutex_lock_op(sbi, NODE_NEW); - page = new_node_page(&dn, 0); - init_dent_inode(name, page); - mutex_unlock_op(sbi, NODE_NEW); - if (IS_ERR(page)) - return PTR_ERR(page); - f2fs_put_page(page, 1); - return 0; + + /* caller should f2fs_put_page(page, 1); */ + return new_node_page(&dn, 0, NULL); } -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct address_space *mapping = sbi->node_inode->i_mapping; @@ -831,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) set_cold_node(dn->inode, page); dn->node_page = page; - sync_inode_page(dn); + if (ipage) + update_inode(dn->inode, ipage); + else + sync_inode_page(dn); set_page_dirty(page); if (ofs == 0) inc_valid_inode_count(sbi); @@ -844,6 +867,12 @@ fail: return ERR_PTR(err); } +/* + * Caller should do after getting the following values. + * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE: f2fs_put_page(page, 1) + * error: nothing + */ static int read_node_page(struct page *page, int type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); @@ -851,8 +880,14 @@ static int read_node_page(struct page *page, int type) get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) + if (ni.blk_addr == NULL_ADDR) { + f2fs_put_page(page, 1); return -ENOENT; + } + + if (PageUptodate(page)) + return LOCKED_PAGE; + return f2fs_readpage(sbi, page, ni.blk_addr, type); } @@ -863,40 +898,53 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; + int err; apage = find_get_page(mapping, nid); - if (apage && PageUptodate(apage)) - goto release_out; + if (apage && PageUptodate(apage)) { + f2fs_put_page(apage, 0); + return; + } f2fs_put_page(apage, 0); apage = grab_cache_page(mapping, nid); if (!apage) return; - if (read_node_page(apage, READA)) - unlock_page(apage); - -release_out: - f2fs_put_page(apage, 0); + err = read_node_page(apage, READA); + if (err == 0) + f2fs_put_page(apage, 0); + else if (err == LOCKED_PAGE) + f2fs_put_page(apage, 1); return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - int err; - struct page *page; struct address_space *mapping = sbi->node_inode->i_mapping; - + struct page *page; + int err; +repeat: page = grab_cache_page(mapping, nid); if (!page) return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto got_it; + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; + } +got_it: BUG_ON(nid != nid_of_node(page)); mark_page_accessed(page); return page; @@ -910,31 +958,27 @@ struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); struct address_space *mapping = sbi->node_inode->i_mapping; - int i, end; - int err = 0; - nid_t nid; + struct blk_plug plug; struct page *page; + int err, i, end; + nid_t nid; /* First, try getting the desired direct node. */ nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); - - page = find_get_page(mapping, nid); - if (page && PageUptodate(page)) - goto page_hit; - f2fs_put_page(page, 0); - repeat: page = grab_cache_page(mapping, nid); if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READA); - if (err) { - f2fs_put_page(page, 1); + err = read_node_page(page, READ_SYNC); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto page_hit; + + blk_start_plug(&plug); /* Then, try readahead for siblings of the desired node */ end = start + MAX_RA_NODE; @@ -946,18 +990,19 @@ repeat: ra_node_page(sbi, nid); } -page_hit: - lock_page(page); - if (PageError(page)) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + blk_finish_plug(&plug); - /* Has the page been truncated? */ + lock_page(page); if (page->mapping != mapping) { f2fs_put_page(page, 1); goto repeat; } +page_hit: + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + mark_page_accessed(page); return page; } @@ -972,7 +1017,7 @@ void sync_inode_page(struct dnode_of_data *dn) if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - f2fs_write_inode(dn->inode, NULL); + update_inode_page(dn->inode); } } @@ -1087,17 +1132,8 @@ static int f2fs_write_node_page(struct page *page, block_t new_addr; struct node_info ni; - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } - wait_on_page_writeback(page); - mutex_lock_op(sbi, NODE_WRITE); - /* get old block addr of this node page */ nid = nid_of_node(page); BUG_ON(page->index != nid); @@ -1105,17 +1141,25 @@ static int f2fs_write_node_page(struct page *page, get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) + if (ni.blk_addr == NULL_ADDR) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); return 0; + } - set_page_writeback(page); + if (wbc->for_reclaim) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; + } - /* insert node offset */ + mutex_lock(&sbi->node_write); + set_page_writeback(page); write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr); dec_page_count(sbi, F2FS_DIRTY_NODES); - - mutex_unlock_op(sbi, NODE_WRITE); + mutex_unlock(&sbi->node_write); unlock_page(page); return 0; } @@ -1130,12 +1174,11 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - write_checkpoint(sbi, false); + f2fs_sync_fs(sbi->sb, true); return 0; } @@ -1144,10 +1187,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, return 0; /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = bio_get_nr_vecs(bdev); + wbc->nr_to_write = max_hw_blocks(sbi); sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - - (bio_get_nr_vecs(bdev) - wbc->nr_to_write); + wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); return 0; } @@ -1166,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page) return 0; } -static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) +static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -1178,7 +1221,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) static int f2fs_release_node_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } /* @@ -1195,14 +1238,13 @@ const struct address_space_operations f2fs_node_aops = { static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) { struct list_head *this; - struct free_nid *i = NULL; + struct free_nid *i; list_for_each(this, head) { i = list_entry(this, struct free_nid, list); if (i->nid == n) - break; - i = NULL; + return i; } - return i; + return NULL; } static void __del_from_free_nid_list(struct free_nid *i) @@ -1211,11 +1253,29 @@ static void __del_from_free_nid_list(struct free_nid *i) kmem_cache_free(free_nid_slab, i); } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) { struct free_nid *i; + struct nat_entry *ne; + bool allocated = false; if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + return -1; + + /* 0 nid should not be used */ + if (nid == 0) + return 0; + + if (!build) + goto retry; + + /* do not add allocated nids */ + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + allocated = true; + read_unlock(&nm_i->nat_tree_lock); + if (allocated) return 0; retry: i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); @@ -1250,63 +1310,59 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) spin_unlock(&nm_i->free_nid_list_lock); } -static int scan_nat_page(struct f2fs_nm_info *nm_i, +static void scan_nat_page(struct f2fs_nm_info *nm_i, struct page *nat_page, nid_t start_nid) { struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; - int fcnt = 0; int i; - /* 0 nid should not be used */ - if (start_nid == 0) - ++start_nid; - i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + + if (start_nid >= nm_i->max_nid) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); BUG_ON(blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - fcnt += add_free_nid(nm_i, start_nid); + if (blk_addr == NULL_ADDR) { + if (add_free_nid(nm_i, start_nid, true) < 0) + break; + } } - return fcnt; } static void build_free_nids(struct f2fs_sb_info *sbi) { - struct free_nid *fnid, *next_fnid; struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - nid_t nid = 0; - bool is_cycled = false; - int fcnt = 0; - int i; + int i = 0; + nid_t nid = nm_i->next_scan_nid; - nid = nm_i->next_scan_nid; - nm_i->init_scan_nid = nid; + /* Enough entries */ + if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + return; + /* readahead nat pages to be scanned */ ra_nat_pages(sbi, nid); while (1) { struct page *page = get_current_nat_page(sbi, nid); - fcnt += scan_nat_page(nm_i, page, nid); + scan_nat_page(nm_i, page, nid); f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - - if (nid >= nm_i->max_nid) { + if (nid >= nm_i->max_nid) nid = 0; - is_cycled = true; - } - if (fcnt > MAX_FREE_NIDS) - break; - if (is_cycled && nm_i->init_scan_nid <= nid) + + if (i++ == FREE_NID_PAGES) break; } + /* go to the next free nat pages to find free nids abundantly */ nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ @@ -1315,22 +1371,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi) block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); nid = le32_to_cpu(nid_in_journal(sum, i)); if (addr == NULL_ADDR) - add_free_nid(nm_i, nid); + add_free_nid(nm_i, nid, true); else remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); - - /* remove the free nids from current allocated nids */ - list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) { - struct nat_entry *ne; - - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, fnid->nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - remove_free_nid(nm_i, fnid->nid); - read_unlock(&nm_i->nat_tree_lock); - } } /* @@ -1344,41 +1389,36 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; struct list_head *this; retry: - mutex_lock(&nm_i->build_lock); - if (!nm_i->fcnt) { - /* scan NAT in order to build free nid list */ - build_free_nids(sbi); - if (!nm_i->fcnt) { - mutex_unlock(&nm_i->build_lock); - return false; - } - } - mutex_unlock(&nm_i->build_lock); + if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + return false; - /* - * We check fcnt again since previous check is racy as - * we didn't hold free_nid_list_lock. So other thread - * could consume all of free nids. - */ spin_lock(&nm_i->free_nid_list_lock); - if (!nm_i->fcnt) { - spin_unlock(&nm_i->free_nid_list_lock); - goto retry; - } - BUG_ON(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); - if (i->state == NID_NEW) - break; - } + /* We should not use stale free nids created by build_free_nids */ + if (nm_i->fcnt && !sbi->on_build_free_nids) { + BUG_ON(list_empty(&nm_i->free_nid_list)); + list_for_each(this, &nm_i->free_nid_list) { + i = list_entry(this, struct free_nid, list); + if (i->state == NID_NEW) + break; + } - BUG_ON(i->state != NID_NEW); - *nid = i->nid; - i->state = NID_ALLOC; - nm_i->fcnt--; + BUG_ON(i->state != NID_NEW); + *nid = i->nid; + i->state = NID_ALLOC; + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + return true; + } spin_unlock(&nm_i->free_nid_list_lock); - return true; + + /* Let's scan nat pages and its caches to get free nids */ + mutex_lock(&nm_i->build_lock); + sbi->on_build_free_nids = 1; + build_free_nids(sbi); + sbi->on_build_free_nids = 0; + mutex_unlock(&nm_i->build_lock); + goto retry; } /* @@ -1391,10 +1431,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - if (i) { - BUG_ON(i->state != NID_ALLOC); - __del_from_free_nid_list(i); - } + BUG_ON(!i || i->state != NID_ALLOC); + __del_from_free_nid_list(i); spin_unlock(&nm_i->free_nid_list_lock); } @@ -1403,8 +1441,19 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) */ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { - alloc_nid_done(sbi, nid); - add_free_nid(NM_I(sbi), nid); + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + BUG_ON(!i || i->state != NID_ALLOC); + if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { + __del_from_free_nid_list(i); + } else { + i->state = NID_NEW; + nm_i->fcnt++; + } + spin_unlock(&nm_i->free_nid_list_lock); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1447,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) new_ni = old_ni; new_ni.ino = ino; + if (!inc_valid_node_count(sbi, NULL, 1)) + WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR); inc_valid_inode_count(sbi); - f2fs_put_page(ipage, 1); return 0; } @@ -1475,23 +1525,24 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i++, sum_entry++) { + /* + * In order to read next node page, + * we must clear PageUptodate flag. + */ + ClearPageUptodate(page); + if (f2fs_readpage(sbi, page, addr, READ_SYNC)) goto out; + lock_page(page); rn = (struct f2fs_node *)page_address(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; addr++; - - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); } -out: unlock_page(page); +out: __free_pages(page, 0); return 0; } @@ -1614,13 +1665,11 @@ flush_now: nid_in_journal(sum, offset) = cpu_to_le32(nid); } - if (nat_get_blkaddr(ne) == NULL_ADDR) { + if (nat_get_blkaddr(ne) == NULL_ADDR && + add_free_nid(NM_I(sbi), nid, false) <= 0) { write_lock(&nm_i->nat_tree_lock); __del_from_nat_cache(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); - - /* We can reuse this freed nid at this point */ - add_free_nid(NM_I(sbi), nid); } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); @@ -1661,19 +1710,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi) spin_lock_init(&nm_i->free_nid_list_lock); rwlock_init(&nm_i->nat_tree_lock); - nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); - nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); - - nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL); - if (!nm_i->nat_bitmap) - return -ENOMEM; + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); if (!version_bitmap) return -EFAULT; - /* copy version bitmap */ - memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size); + nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index afdb130f782e..c65fb4f4230f 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -29,6 +29,9 @@ /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + /* * For node information */ @@ -239,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page) return false; if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { ofs -= 6 + 2 * NIDS_PER_BLOCK; - if ((long int)ofs % (NIDS_PER_BLOCK + 1)) + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) return false; } return true; @@ -272,11 +275,28 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_file(struct inode *inode) +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) { - return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; + F2FS_I(inode)->i_advise |= type; } +static inline void clear_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise &= ~type; +} + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + static inline int is_cold_data(struct page *page) { return PageChecked(page); @@ -292,29 +312,16 @@ static inline void clear_cold_data(struct page *page) ClearPageChecked(page); } -static inline int is_cold_node(struct page *page) +static inline int is_node(struct page *page, int type) { void *kaddr = page_address(page); struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << COLD_BIT_SHIFT); -} - -static inline unsigned char is_fsync_dnode(struct page *page) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << FSYNC_BIT_SHIFT); + return le32_to_cpu(rn->footer.flag) & (1 << type); } -static inline unsigned char is_dent_dnode(struct page *page) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << DENT_BIT_SHIFT); -} +#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) +#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) static inline void set_cold_node(struct inode *inode, struct page *page) { @@ -328,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page) rn->footer.flag = cpu_to_le32(flag); } -static inline void set_fsync_mark(struct page *page, int mark) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - if (mark) - flag |= (0x1 << FSYNC_BIT_SHIFT); - else - flag &= ~(0x1 << FSYNC_BIT_SHIFT); - rn->footer.flag = cpu_to_le32(flag); -} - -static inline void set_dentry_mark(struct page *page, int mark) +static inline void set_mark(struct page *page, int mark, int type) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = (struct f2fs_node *)page_address(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << DENT_BIT_SHIFT); + flag |= (0x1 << type); else - flag &= ~(0x1 << DENT_BIT_SHIFT); + flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); } +#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b235215ac138..d56d951c2253 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); + void *kaddr = page_address(ipage); + struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; struct f2fs_inode *raw_inode = &(raw_node->i); - struct qstr name; + nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; + struct qstr name; struct page *page; - struct inode *dir; + struct inode *dir, *einode; int err = 0; - if (!is_dent_dnode(ipage)) - goto out; - - dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); - if (IS_ERR(dir)) { - err = -EINVAL; - goto out; + dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino); + if (!dir) { + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } + set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + add_dirty_dir_inode(dir); } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; - +retry: de = f2fs_find_entry(dir, &name, &page); - if (de) { + if (de && inode->i_ino == le32_to_cpu(de->ino)) { kunmap(page); f2fs_put_page(page, 0); - } else { - err = __f2fs_add_link(dir, &name, inode); + goto out; + } + if (de) { + einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + if (IS_ERR(einode)) { + WARN_ON(1); + if (PTR_ERR(einode) == -ENOENT) + err = -EEXIST; + goto out; + } + f2fs_delete_entry(de, page, einode); + iput(einode); + goto retry; } - iput(dir); + err = __f2fs_add_link(dir, &name, inode); out: - kunmap(ipage); + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " + "ino = %x, name = %s, dir = %lx, err = %d", + ino_of_node(ipage), raw_inode->i_name, + IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page) struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; struct f2fs_inode *raw_inode = &(raw_node->i); + if (!IS_INODE(node_page)) + return 0; + inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_size_write(inode, le64_to_cpu(raw_inode->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); @@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page) inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - return recover_dentry(node_page, inode); + if (is_dent_dnode(node_page)) + return recover_dentry(node_page, inode); + + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", + ino_of_node(node_page), raw_inode->i_name); + return 0; } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) @@ -112,64 +138,61 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + if (err) goto out; + lock_page(page); + if (cp_ver != cpver_of_node(page)) - goto out; + break; if (!is_fsync_dnode(page)) goto next; entry = get_fsync_inode(head, ino_of_node(page)); if (entry) { - entry->blkaddr = blkaddr; if (IS_INODE(page) && is_dent_dnode(page)) set_inode_flag(F2FS_I(entry->inode), FI_INC_LINK); } else { if (IS_INODE(page) && is_dent_dnode(page)) { - if (recover_inode_page(sbi, page)) { - err = -ENOMEM; - goto out; - } + err = recover_inode_page(sbi, page); + if (err) + break; } /* add this fsync inode to the list */ entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); if (!entry) { err = -ENOMEM; - goto out; + break; } entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - goto out; + break; } - list_add_tail(&entry->list, head); - entry->blkaddr = blkaddr; - } - if (IS_INODE(page)) { - err = recover_inode(entry->inode, page); - if (err) - goto out; } + entry->blkaddr = blkaddr; + + err = recover_inode(entry->inode, page); + if (err && err != -ENOENT) + break; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: unlock_page(page); +out: __free_pages(page, 0); return err; } -static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, - struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; @@ -180,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, } } -static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, - block_t blkaddr) +static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr, struct dnode_of_data *dn) { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); struct f2fs_summary sum; - nid_t ino; + nid_t ino, nid; void *kaddr; struct inode *inode; struct page *node_page; @@ -197,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) - return; + return 0; /* Get the previous summary */ for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { @@ -216,29 +239,50 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, f2fs_put_page(sum_page, 1); } + /* Use the locked dnode page and inode */ + nid = le32_to_cpu(sum.nid); + if (dn->inode->i_ino == nid) { + struct dnode_of_data tdn = *dn; + tdn.nid = nid; + tdn.node_page = dn->inode_page; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + truncate_data_blocks_range(&tdn, 1); + return 0; + } else if (dn->nid == nid) { + struct dnode_of_data tdn = *dn; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + truncate_data_blocks_range(&tdn, 1); + return 0; + } + /* Get the node page */ - node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); bidx = start_bidx_of_node(ofs_of_node(node_page)) + - le16_to_cpu(sum.ofs_in_node); + le16_to_cpu(sum.ofs_in_node); ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); /* Deallocate previous index in the node page */ inode = f2fs_iget(sbi->sb, ino); if (IS_ERR(inode)) - return; + return PTR_ERR(inode); truncate_hole(inode, bidx, bidx + 1); iput(inode); + return 0; } -static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { unsigned int start, end; struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; + int err = 0, recovered = 0; + int ilock; start = start_bidx_of_node(ofs_of_node(page)); if (IS_INODE(page)) @@ -246,9 +290,14 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, else end = start + ADDRS_PER_BLOCK; + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, 0)) - return; + + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + mutex_unlock_op(sbi, ilock); + return err; + } wait_on_page_writeback(dn.node_page); @@ -270,13 +319,16 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* Check the previous node page having this index */ - check_index_in_prev_nodes(sbi, dest); + err = check_index_in_prev_nodes(sbi, dest, &dn); + if (err) + goto err; set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* write dummy data page */ recover_data_page(sbi, NULL, &sum, src, dest); update_extent_cache(dest, &dn); + recovered++; } dn.ofs_in_node++; } @@ -292,15 +344,23 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); +err: f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, ilock); + + f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " + "recovered_data = %d blocks, err = %d", + inode->i_ino, recovered, err); + return err; } -static void recover_data(struct f2fs_sb_info *sbi, +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head, int type) { unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); struct curseg_info *curseg; struct page *page; + int err = 0; block_t blkaddr; /* get node pages in the current segment */ @@ -310,23 +370,29 @@ static void recover_data(struct f2fs_sb_info *sbi, /* read node page */ page = alloc_page(GFP_NOFS | __GFP_ZERO); if (IS_ERR(page)) - return; + return -ENOMEM; + lock_page(page); while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + if (err) goto out; + lock_page(page); + if (cp_ver != cpver_of_node(page)) - goto out; + break; entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) goto next; - do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page, blkaddr); + if (err) + break; if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -336,40 +402,45 @@ static void recover_data(struct f2fs_sb_info *sbi, next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: unlock_page(page); +out: __free_pages(page, 0); - allocate_new_segments(sbi); + if (!err) + allocate_new_segments(sbi); + return err; } -void recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi) { struct list_head inode_list; + int err; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); if (unlikely(!fsync_entry_slab)) - return; + return -ENOMEM; INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ - if (find_fsync_dnodes(sbi, &inode_list)) + sbi->por_doing = 1; + err = find_fsync_dnodes(sbi, &inode_list); + if (err) goto out; if (list_empty(&inode_list)) goto out; /* step #2: recover data */ - sbi->por_doing = 1; - recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - sbi->por_doing = 0; + err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); BUG_ON(!list_empty(&inode_list)); out: - destroy_fsync_dnodes(sbi, &inode_list); + destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); - write_checkpoint(sbi, false); + sbi->por_doing = 0; + if (!err) + write_checkpoint(sbi, false); + return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 777f17e496e6..a86d125a9885 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -18,6 +18,7 @@ #include "f2fs.h" #include "segment.h" #include "node.h" +#include <trace/events/f2fs.h> /* * This function balances dirty node and dentry pages. @@ -49,9 +50,20 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = DIRTY_HOT_DATA; + dirty_type = sentry->type; + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; + + /* Only one bitmap should be set */ + for (; t <= DIRTY_COLD_NODE; t++) { + if (t == dirty_type) + continue; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + } } } @@ -64,13 +76,16 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { - struct seg_entry *sentry = get_seg_entry(sbi, segno); - dirty_type = sentry->type; - if (test_and_clear_bit(segno, - dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]--; - clear_bit(segno, dirty_i->victim_segmap[FG_GC]); - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); + enum dirty_type t = DIRTY_HOT_DATA; + + /* clear all the bitmaps */ + for (; t <= DIRTY_COLD_NODE; t++) + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + clear_bit(GET_SECNO(sbi, segno), + dirty_i->victim_secmap); } } @@ -79,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ -void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks; @@ -111,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno, offset = 0; + unsigned int segno = -1; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); while (1) { segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - offset); + segno + 1); if (segno >= total_segs) break; __set_test_and_free(sbi, segno); - offset = segno + 1; } mutex_unlock(&dirty_i->seglist_lock); } @@ -129,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno, offset = 0; + unsigned int segno = -1; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); while (1) { segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - offset); + segno + 1); if (segno >= total_segs) break; - offset = segno + 1; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) dirty_i->nr_dirty[PRE]--; @@ -242,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) * This function should be resided under the curseg_mutex lock */ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, - struct f2fs_summary *sum, unsigned short offset) + struct f2fs_summary *sum) { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; - addr += offset * sizeof(struct f2fs_summary); + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); return; } @@ -296,48 +309,15 @@ static void write_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, - int ofs_unit, int type) +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { - struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; - unsigned int segno, next_segno, i; - int ofs = 0; - - /* - * If there is not enough reserved sections, - * we should not reuse prefree segments. - */ - if (has_not_enough_free_secs(sbi, 0)) - return NULL_SEGNO; + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); - /* - * NODE page should not reuse prefree segment, - * since those information is used for SPOR. - */ - if (IS_NODESEG(type)) - return NULL_SEGNO; -next: - segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++); - ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit; - if (segno < TOTAL_SEGS(sbi)) { - /* skip intermediate segments in a section */ - if (segno % ofs_unit) - goto next; - - /* skip if whole section is not prefree */ - next_segno = find_next_zero_bit(prefree_segmap, - TOTAL_SEGS(sbi), segno + 1); - if (next_segno - segno < ofs_unit) - goto next; - - /* skip if whole section was not free at the last checkpoint */ - for (i = 0; i < ofs_unit; i++) - if (get_seg_entry(sbi, segno)->ckpt_valid_blocks) - goto next; - return segno; - } - return NULL_SEGNO; + if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; } /* @@ -348,9 +328,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int total_secs = sbi->total_sections; unsigned int segno, secno, zoneno; - unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone; + unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = *newseg / sbi->segs_per_sec; unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); unsigned int left_start = hint; @@ -363,16 +342,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, TOTAL_SEGS(sbi), *newseg + 1); - if (segno < TOTAL_SEGS(sbi)) + if (segno - *newseg < sbi->segs_per_sec - + (*newseg % sbi->segs_per_sec)) goto got_it; } find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint); - if (secno >= total_secs) { + secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); + if (secno >= TOTAL_SECS(sbi)) { if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(secno >= total_secs); + TOTAL_SECS(sbi), 0); + BUG_ON(secno >= TOTAL_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -387,8 +367,8 @@ find_other_zone: continue; } left_start = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(left_start >= total_secs); + TOTAL_SECS(sbi), 0); + BUG_ON(left_start >= TOTAL_SECS(sbi)); break; } secno = left_start; @@ -463,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) int dir = ALLOC_LEFT; write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + GET_SUM_BLOCK(sbi, segno)); if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; @@ -561,26 +541,25 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int ofs_unit; if (force) { new_curseg(sbi, type, true); goto out; } - ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec; - curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type); - - if (curseg->next_segno != NULL_SEGNO) - change_curseg(sbi, type, false); - else if (type == CURSEG_WARM_NODE) + if (type == CURSEG_WARM_NODE) + new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); out: +#ifdef CONFIG_F2FS_STAT_FS sbi->segment_count[curseg->alloc_type]++; +#endif + return; } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -656,10 +635,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; + if (btype == META) + rw |= REQ_META; + if (sbi->bio[btype]) { struct bio_private *p = sbi->bio[btype]->bi_private; p->sbi = sbi; sbi->bio[btype]->bi_end_io = f2fs_end_io_write; + + trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); + if (type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); p->is_sync = true; @@ -696,7 +681,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, do_submit_bio(sbi, type, false); alloc_new: if (sbi->bio[type] == NULL) { - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev)); + sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); /* * The end_io will be assigned at the sumbission phase. @@ -714,6 +699,7 @@ alloc_new: sbi->last_block_in_bio[type] = blk_addr; up_write(&sbi->bio_sem); + trace_f2fs_submit_write_page(page, blk_addr, type); } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -756,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; - else if (is_cold_data(page) || is_cold_file(inode)) + else if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; else return CURSEG_WARM_DATA; @@ -805,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, * because, this function updates a summary entry in the * current summary block. */ - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); +#ifdef CONFIG_F2FS_STAT_FS sbi->block_count[curseg->alloc_type]++; +#endif /* * SIT information should be updated before segment allocation, @@ -904,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); @@ -941,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, } curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ if (next_segno != segno) { @@ -1390,7 +1378,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) } if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(sbi->total_sections * + sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * sizeof(struct sec_entry)); if (!sit_i->sec_entries) return -ENOMEM; @@ -1403,10 +1391,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); if (!dst_bitmap) return -ENOMEM; - memcpy(dst_bitmap, src_bitmap, bitmap_size); /* init SIT information */ sit_i->s_ops = &default_salloc_ops; @@ -1442,7 +1429,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections); + sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1541,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0; + unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); unsigned short valid_blocks; - while (segno < TOTAL_SEGS(sbi)) { + while (1) { /* find dirty segment based on free segmap */ - segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); - if (segno >= TOTAL_SEGS(sbi)) + segno = find_next_inuse(free_i, total_segs, offset); + if (segno >= total_segs) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, 0); @@ -1559,14 +1546,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) } } -static int init_victim_segmap(struct f2fs_sb_info *sbi) +static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); - dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC]) + dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) return -ENOMEM; return 0; } @@ -1593,7 +1579,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) } init_dirty_segmap(sbi); - return init_victim_segmap(sbi); + return init_victim_secmap(sbi); } /* @@ -1680,18 +1666,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, mutex_unlock(&dirty_i->seglist_lock); } -void reset_victim_segmap(struct f2fs_sb_info *sbi) -{ - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); - memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size); -} - -static void destroy_victim_segmap(struct f2fs_sb_info *sbi) +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - - kfree(dirty_i->victim_segmap[FG_GC]); - kfree(dirty_i->victim_segmap[BG_GC]); + kfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -1706,7 +1684,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); - destroy_victim_segmap(sbi); + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 552dadbb2327..062424a0e4c3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -8,10 +8,13 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#include <linux/blkdev.h> + /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) -/* V: Logical segment # in volume, R: Relative segment # in main area */ +/* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -23,13 +26,13 @@ ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ (t == CURSEG_WARM_NODE)) -#define IS_CURSEG(sbi, segno) \ - ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) +#define IS_CURSEG(sbi, seg) \ + ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -81,9 +84,12 @@ #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) +#define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) +#define SECTOR_TO_BLOCK(sbi, sectors) \ + (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) /* during checkpoint, bio_private is used to synchronize the last bio */ struct bio_private { @@ -213,7 +219,7 @@ struct dirty_seglist_info { unsigned long *dirty_segmap[NR_DIRTY_TYPE]; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ - unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */ + unsigned long *victim_secmap; /* background GC victims */ }; /* victim selection function for cleaning and SSR */ @@ -464,8 +470,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) static inline int utilization(struct f2fs_sb_info *sbi) { - return (long int)valid_user_blocks(sbi) * 100 / - (long int)sbi->user_block_count; + return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* @@ -616,3 +621,17 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fea6e582a2ed..75c7dc363e92 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/statfs.h> -#include <linux/proc_fs.h> #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/kthread.h> @@ -21,16 +20,21 @@ #include <linux/seq_file.h> #include <linux/random.h> #include <linux/exportfs.h> +#include <linux/blkdev.h> #include <linux/f2fs_fs.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" +#define CREATE_TRACE_POINTS +#include <trace/events/f2fs.h> + static struct kmem_cache *f2fs_inode_cachep; enum { - Opt_gc_background_off, + Opt_gc_background, Opt_disable_roll_forward, Opt_discard, Opt_noheap, @@ -42,7 +46,7 @@ enum { }; static match_table_t f2fs_tokens = { - {Opt_gc_background_off, "background_gc_off"}, + {Opt_gc_background, "background_gc=%s"}, {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_discard, "discard"}, {Opt_noheap, "no_heap"}, @@ -72,6 +76,91 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +static int parse_options(struct super_block *sb, char *options) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + int arg = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, f2fs_tokens, args); + + switch (token) { + case Opt_gc_background: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strncmp(name, "on", 2)) + set_opt(sbi, BG_GC); + else if (!strncmp(name, "off", 3)) + clear_opt(sbi, BG_GC); + else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_disable_roll_forward: + set_opt(sbi, DISABLE_ROLL_FORWARD); + break; + case Opt_discard: + set_opt(sbi, DISCARD); + break; + case Opt_noheap: + set_opt(sbi, NOHEAP); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_nouser_xattr: + clear_opt(sbi, XATTR_USER); + break; +#else + case Opt_nouser_xattr: + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_noacl: + clear_opt(sbi, POSIX_ACL); + break; +#else + case Opt_noacl: + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); + break; +#endif + case Opt_active_logs: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + return -EINVAL; + sbi->active_logs = arg; + break; + case Opt_disable_ext_identify: + set_opt(sbi, DISABLE_EXT_IDENTIFY); + break; + default: + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + } + return 0; +} + static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; @@ -82,7 +171,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - /* Initilize f2fs-specific inode info */ + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_dents, 0); fi->i_current_depth = 1; @@ -94,6 +183,31 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) return &fi->vfs_inode; } +static int f2fs_drop_inode(struct inode *inode) +{ + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + return 0; + return generic_drop_inode(inode); +} + +/* + * f2fs_dirty_inode() is called from __mark_inode_dirty() + * + * We should call set_dirty_inode to write the dirty inode through write_inode. + */ +static void f2fs_dirty_inode(struct inode *inode, int flags) +{ + set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + return; +} + static void f2fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -132,13 +246,18 @@ int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + trace_f2fs_sync_fs(sb, sync); + if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) return 0; - if (sync) + if (sync) { + mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, false); - else + mutex_unlock(&sbi->gc_mutex); + } else { f2fs_balance_fs(sbi); + } return 0; } @@ -147,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb) { int err; - if (sb->s_flags & MS_RDONLY) + if (f2fs_readonly(sb)) return 0; err = f2fs_sync_fs(sb, 1); @@ -180,7 +299,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->total_node_count; buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); - buf->f_namelen = F2FS_MAX_NAME_LEN; + buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -191,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (test_opt(sbi, BG_GC)) - seq_puts(seq, ",background_gc_on"); + if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC)) + seq_printf(seq, ",background_gc=%s", "on"); else - seq_puts(seq, ",background_gc_off"); + seq_printf(seq, ",background_gc=%s", "off"); if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) @@ -221,10 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static int f2fs_remount(struct super_block *sb, int *flags, char *data) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_mount_info org_mount_opt; + int err, active_logs; + + /* + * Save the old mount options in case we + * need to restore them. + */ + org_mount_opt = sbi->mount_opt; + active_logs = sbi->active_logs; + + /* parse mount options */ + err = parse_options(sb, data); + if (err) + goto restore_opts; + + /* + * Previous and new state of filesystem is RO, + * so no point in checking GC conditions. + */ + if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + goto skip; + + /* + * We stop the GC thread if FS is mounted as RO + * or if background_gc = off is passed in mount + * option. Also sync the filesystem. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if (sbi->gc_thread) { + stop_gc_thread(sbi); + f2fs_sync_fs(sb, 1); + } + } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { + err = start_gc_thread(sbi); + if (err) + goto restore_opts; + } +skip: + /* Update the POSIXACL Flag */ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; + +restore_opts: + sbi->mount_opt = org_mount_opt; + sbi->active_logs = active_logs; + return err; +} + static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, + .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, + .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, @@ -232,6 +405,7 @@ static struct super_operations f2fs_sops = { .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, + .remount_fs = f2fs_remount, }; static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -279,79 +453,6 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, - char *options) -{ - substring_t args[MAX_OPT_ARGS]; - char *p; - int arg = 0; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); - - switch (token) { - case Opt_gc_background_off: - clear_opt(sbi, BG_GC); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); - break; - case Opt_discard: - set_opt(sbi, DISCARD); - break; - case Opt_noheap: - set_opt(sbi, NOHEAP); - break; -#ifdef CONFIG_F2FS_FS_XATTR - case Opt_nouser_xattr: - clear_opt(sbi, XATTR_USER); - break; -#else - case Opt_nouser_xattr: - f2fs_msg(sb, KERN_INFO, - "nouser_xattr options not supported"); - break; -#endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; -#else - case Opt_noacl: - f2fs_msg(sb, KERN_INFO, "noacl options not supported"); - break; -#endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) - return -EINVAL; - sbi->active_logs = arg; - break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); - break; - default: - f2fs_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" or missing value", - p); - return -EINVAL; - } - } - return 0; -} - static loff_t max_file_size(unsigned bits) { loff_t result = ADDRS_PER_INODE; @@ -457,6 +558,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); @@ -473,7 +575,7 @@ static int validate_superblock(struct super_block *sb, if (!*raw_super_buf) { f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", super); - return 1; + return -EIO; } *raw_super = (struct f2fs_super_block *) @@ -485,7 +587,7 @@ static int validate_superblock(struct super_block *sb, f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " "in %s superblock", super); - return 1; + return -EINVAL; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -508,11 +610,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { + err = validate_superblock(sb, &raw_super, &raw_super_buf, 0); + if (err) { brelse(raw_super_buf); - if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) + /* check secondary superblock when primary failed */ + err = validate_superblock(sb, &raw_super, &raw_super_buf, 1); + if (err) goto free_sb_buf; } + sb->s_fs_info = sbi; /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -525,7 +631,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sb, sbi, (char *)data)) + err = parse_options(sb, (char *)data); + if (err) goto free_sb_buf; sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); @@ -536,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; - sb->s_fs_info = sbi; sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -547,11 +653,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->raw_super = raw_super; sbi->raw_super_buf = raw_super_buf; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->write_inode); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_LOCK_TYPE; i++) + for (i = 0; i < NR_GLOBAL_LOCKS; i++) mutex_init(&sbi->fs_lock[i]); + mutex_init(&sbi->node_write); sbi->por_doing = 0; spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->bio_sem); @@ -638,18 +744,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) - recover_fsync_data(sbi); + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } - /* After POR, we can run background GC thread */ - err = start_gc_thread(sbi); - if (err) - goto fail; + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto fail; + } err = f2fs_build_stats(sbi); if (err) goto fail; + if (test_opt(sbi, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + return 0; fail: stop_gc_thread(sbi); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8038c0496504..3ab07ecd86ca 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -20,6 +20,7 @@ */ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> +#include <linux/security.h> #include "f2fs.h" #include "xattr.h" @@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; break; + case F2FS_XATTR_INDEX_SECURITY: + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + break; default: return -EINVAL; } @@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, total_len = prefix_len + name_len + 1; if (list && total_len <= list_size) { memcpy(list, prefix, prefix_len); - memcpy(list+prefix_len, name, name_len); + memcpy(list + prefix_len, name, name_len); list[prefix_len + name_len] = '\0'; } return total_len; @@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, if (!capable(CAP_SYS_ADMIN)) return -EPERM; break; + case F2FS_XATTR_INDEX_SECURITY: + break; default: return -EINVAL; } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, - buffer, size); + return f2fs_getxattr(dentry->d_inode, type, name, buffer, size); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (!capable(CAP_SYS_ADMIN)) return -EPERM; break; + case F2FS_XATTR_INDEX_SECURITY: + break; default: return -EINVAL; } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, value, size); + return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL); } static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, @@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, return 0; } +#ifdef CONFIG_F2FS_FS_SECURITY +static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *page) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, (struct page *)page); + if (err < 0) + break; + } + return err; +} + +int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return security_inode_init_security(inode, dir, qstr, + &f2fs_initxattrs, ipage); +} +#endif + const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, @@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = { .set = f2fs_xattr_advise_set, }; +const struct xattr_handler f2fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = F2FS_XATTR_INDEX_SECURITY, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, +#endif [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, }; @@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_acl_default_handler, #endif &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + &f2fs_xattr_security_handler, +#endif &f2fs_xattr_advise_handler, NULL, }; @@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, return -ENODATA; page = get_node_page(sbi, fi->i_xattr_nid); + if (IS_ERR(page)) + return PTR_ERR(page); base_addr = page_address(page); list_for_each_xattr(entry, base_addr) { @@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return 0; page = get_node_page(sbi, fi->i_xattr_nid); + if (IS_ERR(page)) + return PTR_ERR(page); base_addr = page_address(page); list_for_each_xattr(entry, base_addr) { @@ -296,7 +347,7 @@ cleanup: } int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len) + const void *value, size_t value_len, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -307,37 +358,40 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, int error, found, free, newsize; size_t name_len; char *pval; + int ilock; if (name == NULL) return -EINVAL; - name_len = strlen(name); if (value == NULL) value_len = 0; - if (name_len > 255 || value_len > MAX_VALUE_LEN) + name_len = strlen(name); + + if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) return -ERANGE; f2fs_balance_fs(sbi); - mutex_lock_op(sbi, NODE_NEW); + ilock = mutex_lock_op(sbi); + if (!fi->i_xattr_nid) { /* Allocate new attribute block */ struct dnode_of_data dn; if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - mutex_unlock_op(sbi, NODE_NEW); - return -ENOSPC; + error = -ENOSPC; + goto exit; } set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); mark_inode_dirty(inode); - page = new_node_page(&dn, XATTR_NODE_OFFSET); + page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); if (IS_ERR(page)) { alloc_nid_failed(sbi, fi->i_xattr_nid); fi->i_xattr_nid = 0; - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); + error = PTR_ERR(page); + goto exit; } alloc_nid_done(sbi, fi->i_xattr_nid); @@ -349,8 +403,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, /* The inode already has an extended attribute block. */ page = get_node_page(sbi, fi->i_xattr_nid); if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); + error = PTR_ERR(page); + goto exit; } base_addr = page_address(page); @@ -432,12 +486,16 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } - f2fs_write_inode(inode, NULL); - mutex_unlock_op(sbi, NODE_NEW); + if (ipage) + update_inode(inode, ipage); + else + update_inode_page(inode); + mutex_unlock_op(sbi, ilock); return 0; cleanup: f2fs_put_page(page, 1); - mutex_unlock_op(sbi, NODE_NEW); +exit: + mutex_unlock_op(sbi, ilock); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 49c9558305e3..3c0817bef25d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler; extern const struct xattr_handler f2fs_xattr_acl_access_handler; extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; +extern const struct xattr_handler f2fs_xattr_security_handler; extern const struct xattr_handler *f2fs_xattr_handlers[]; -extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len); -extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size); -extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, - size_t buffer_size); - +extern int f2fs_setxattr(struct inode *, int, const char *, + const void *, size_t, struct page *); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); +extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #else #define f2fs_xattr_handlers NULL static inline int f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len) + const char *name, const void *value, size_t value_len) { return -EOPNOTSUPP; } @@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, } #endif +#ifdef CONFIG_F2FS_FS_SECURITY +extern int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct page *); +#else +static inline int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return 0; +} +#endif #endif /* __F2FS_XATTR_H__ */ diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 165012ef363a..3963ede84eb0 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -543,6 +543,7 @@ end_of_dir: EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { + struct dir_context ctx; void __user *dirent; int result; /* for dir ioctl */ @@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback { int short_len; }; -static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, - filldir_t filldir, int short_only, int both) +static int __fat_readdir(struct inode *inode, struct file *file, + struct dir_context *ctx, int short_only, + struct fat_ioctl_filldir_callback *both) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, unsigned char bufname[FAT_MAX_SHORT_SIZE]; int isvfat = sbi->options.isvfat; const char *fill_name = NULL; - unsigned long inum; - unsigned long lpos, dummy, *furrfu = &lpos; + int fake_offset = 0; loff_t cpos; int short_len = 0, fill_len = 0; int ret = 0; mutex_lock(&sbi->s_lock); - cpos = filp->f_pos; + cpos = ctx->pos; /* Fake . and .. for the root directory. */ if (inode->i_ino == MSDOS_ROOT_INO) { - while (cpos < 2) { - if (filldir(dirent, "..", cpos+1, cpos, - MSDOS_ROOT_INO, DT_DIR) < 0) - goto out; - cpos++; - filp->f_pos++; - } - if (cpos == 2) { - dummy = 2; - furrfu = &dummy; + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos == 2) { + fake_offset = 1; cpos = 0; } } @@ -619,7 +614,7 @@ parse_record: int status = fat_parse_long(inode, &cpos, &bh, &de, &unicode, &nr_slots); if (status < 0) { - filp->f_pos = cpos; + ctx->pos = cpos; ret = status; goto out; } else if (status == PARSE_INVALID) @@ -639,6 +634,19 @@ parse_record: /* !both && !short_only, so we don't need shortname. */ if (!both) goto start_filldir; + + short_len = fat_parse_short(sb, de, bufname, + sbi->options.dotsOK); + if (short_len == 0) + goto record_end; + /* hack for fat_ioctl_filldir() */ + both->longname = fill_name; + both->long_len = fill_len; + both->shortname = bufname; + both->short_len = short_len; + fill_name = NULL; + fill_len = 0; + goto start_filldir; } } @@ -646,28 +654,21 @@ parse_record: if (short_len == 0) goto record_end; - if (nr_slots) { - /* hack for fat_ioctl_filldir() */ - struct fat_ioctl_filldir_callback *p = dirent; - - p->longname = fill_name; - p->long_len = fill_len; - p->shortname = bufname; - p->short_len = short_len; - fill_name = NULL; - fill_len = 0; - } else { - fill_name = bufname; - fill_len = short_len; - } + fill_name = bufname; + fill_len = short_len; start_filldir: - lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); - if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) - inum = inode->i_ino; - else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { - inum = parent_ino(filp->f_path.dentry); + if (!fake_offset) + ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + + if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) { + if (!dir_emit_dot(file, ctx)) + goto fill_failed; + } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + if (!dir_emit_dotdot(file, ctx)) + goto fill_failed; } else { + unsigned long inum; loff_t i_pos = fat_make_i_pos(sb, bh, de); struct inode *tmp = fat_iget(sb, i_pos); if (tmp) { @@ -675,18 +676,17 @@ start_filldir: iput(tmp); } else inum = iunique(sb, MSDOS_ROOT_INO); + if (!dir_emit(ctx, fill_name, fill_len, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG)) + goto fill_failed; } - if (filldir(dirent, fill_name, fill_len, *furrfu, inum, - (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) - goto fill_failed; - record_end: - furrfu = &lpos; - filp->f_pos = cpos; + fake_offset = 0; + ctx->pos = cpos; goto get_new; end_of_dir: - filp->f_pos = cpos; + ctx->pos = cpos; fill_failed: brelse(bh); if (unicode) @@ -696,10 +696,9 @@ out: return ret; } -static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int fat_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); - return __fat_readdir(inode, filp, dirent, filldir, 0, 0); + return __fat_readdir(file_inode(file), file, ctx, 0, NULL); } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ @@ -755,20 +754,25 @@ efault: \ FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) -static int fat_ioctl_readdir(struct inode *inode, struct file *filp, +static int fat_ioctl_readdir(struct inode *inode, struct file *file, void __user *dirent, filldir_t filldir, int short_only, int both) { - struct fat_ioctl_filldir_callback buf; + struct fat_ioctl_filldir_callback buf = { + .ctx.actor = filldir, + .dirent = dirent + }; int ret; buf.dirent = dirent; buf.result = 0; mutex_lock(&inode->i_mutex); + buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { - ret = __fat_readdir(inode, filp, &buf, filldir, - short_only, both); + ret = __fat_readdir(inode, file, &buf.ctx, + short_only, both ? &buf : NULL); + file->f_pos = buf.ctx.pos; } mutex_unlock(&inode->i_mutex); if (ret >= 0) @@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = fat_readdir, + .iterate = fat_readdir, .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, @@ -964,6 +968,29 @@ int fat_scan(struct inode *dir, const unsigned char *name, } EXPORT_SYMBOL_GPL(fat_scan); +/* + * Scans a directory for a given logstart. + * Returns an error code or zero. + */ +int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} + static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { struct super_block *sb = dir->i_sb; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e9cc3f0d58e2..4241e6f39e86 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -23,6 +23,9 @@ #define FAT_ERRORS_PANIC 2 /* panic on error */ #define FAT_ERRORS_RO 3 /* remount r/o on error */ +#define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */ +#define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */ + struct fat_mount_options { kuid_t fs_uid; kgid_t fs_gid; @@ -34,6 +37,7 @@ struct fat_mount_options { unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned char nfs; /* NFS support: nostale_ro, stale_rw */ unsigned short allow_utime;/* permission for setting the [am]time */ unsigned quiet:1, /* set = fake successful chmods and chowns */ showexec:1, /* set = only set x bit for com/exe/bat */ @@ -48,8 +52,7 @@ struct fat_mount_options { usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ - discard:1, /* Issue discard requests on deletions */ - nfs:1; /* Do extra work needed for NFS export */ + discard:1; /* Issue discard requests on deletions */ }; #define FAT_HASH_BITS 8 @@ -72,6 +75,7 @@ struct msdos_sb_info { unsigned long root_cluster; /* first cluster of the root directory */ unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ struct mutex fat_lock; + struct mutex nfs_build_inode_lock; struct mutex s_lock; unsigned int prev_free; /* previously allocated cluster number */ unsigned int free_clusters; /* -1 if undefined */ @@ -82,6 +86,7 @@ struct msdos_sb_info { const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned int vol_id; /*volume ID*/ int fatent_shift; struct fatent_operations *fatent_ops; @@ -215,6 +220,27 @@ static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) + sbi->data_start; } +static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi, + loff_t i_pos, sector_t *blknr, int *offset) +{ + *blknr = i_pos >> sbi->dir_per_block_bits; + *offset = i_pos & (sbi->dir_per_block - 1); +} + +static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, + struct inode *inode) +{ + loff_t i_pos; +#if BITS_PER_LONG == 32 + spin_lock(&sbi->inode_hash_lock); +#endif + i_pos = MSDOS_I(inode)->i_pos; +#if BITS_PER_LONG == 32 + spin_unlock(&sbi->inode_hash_lock); +#endif + return i_pos; +} + static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) { #ifdef __BIG_ENDIAN @@ -271,6 +297,8 @@ extern int fat_dir_empty(struct inode *dir); extern int fat_subdirs(struct inode *dir); extern int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo); +extern int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); @@ -348,6 +376,7 @@ extern struct inode *fat_build_inode(struct super_block *sb, extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)); +extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); @@ -382,12 +411,8 @@ int fat_cache_init(void); void fat_cache_destroy(void); /* fat/nfs.c */ -struct fid; -extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -extern struct dentry *fat_get_parent(struct dentry *child_dir); +extern const struct export_operations fat_export_ops; +extern const struct export_operations fat_export_ops_nostale; /* helper for printk */ typedef unsigned long long llu; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3978f8ca1823..9b104f543056 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -114,6 +114,12 @@ out: return err; } +static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + return put_user(sbi->vol_id, user_attr); +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return fat_ioctl_set_attributes(filp, user_attr); + case FAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_get_volume_id(inode, user_attr); default: return -ENOTTY; /* Inappropriate ioctl for device */ } @@ -306,6 +314,11 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + + if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + /* Use i_pos for ino. This is used as fileid of nfs. */ + stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + } return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index acf6e479b443..11b51bb55b42 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,8 +18,8 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/buffer_head.h> -#include <linux/exportfs.h> #include <linux/mount.h> +#include <linux/aio.h> #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> @@ -385,7 +385,7 @@ static int fat_calc_dir_size(struct inode *inode) } /* doesn't deal with root inode */ -static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -444,12 +444,25 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return 0; } +static inline void fat_lock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_lock(&sbi->nfs_build_inode_lock); +} + +static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_unlock(&sbi->nfs_build_inode_lock); +} + struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos) { struct inode *inode; int err; + fat_lock_build_inode(MSDOS_SB(sb)); inode = fat_iget(sb, i_pos); if (inode) goto out; @@ -469,6 +482,7 @@ struct inode *fat_build_inode(struct super_block *sb, fat_attach(inode, i_pos); insert_inode_hash(inode); out: + fat_unlock_build_inode(MSDOS_SB(sb)); return inode; } @@ -655,20 +669,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, - struct inode *inode) -{ - loff_t i_pos; -#if BITS_PER_LONG == 32 - spin_lock(&sbi->inode_hash_lock); -#endif - i_pos = MSDOS_I(inode)->i_pos; -#if BITS_PER_LONG == 32 - spin_unlock(&sbi->inode_hash_lock); -#endif - return i_pos; -} - static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; @@ -676,7 +676,8 @@ static int __fat_write_inode(struct inode *inode, int wait) struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; - int err; + sector_t blocknr; + int err, offset; if (inode->i_ino == MSDOS_ROOT_INO) return 0; @@ -686,7 +687,8 @@ retry: if (!i_pos) return 0; - bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); + fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read inode block " "for updating (i_pos %lld)", i_pos); @@ -699,8 +701,7 @@ retry: goto retry; } - raw_entry = &((struct msdos_dir_entry *) (bh->b_data)) - [i_pos & (sbi->dir_per_block - 1)]; + raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset]; if (S_ISDIR(inode->i_mode)) raw_entry->size = 0; else @@ -761,12 +762,6 @@ static const struct super_operations fat_sops = { .show_options = fat_show_options, }; -static const struct export_operations fat_export_ops = { - .fh_to_dentry = fat_fh_to_dentry, - .fh_to_parent = fat_fh_to_parent, - .get_parent = fat_get_parent, -}; - static int fat_show_options(struct seq_file *m, struct dentry *root) { struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); @@ -814,8 +809,6 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",usefree"); if (opts->quiet) seq_puts(m, ",quiet"); - if (opts->nfs) - seq_puts(m, ",nfs"); if (opts->showexec) seq_puts(m, ",showexec"); if (opts->sys_immutable) @@ -849,6 +842,10 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); + if (opts->nfs == FAT_NFS_NOSTALE_RO) + seq_puts(m, ",nfs=nostale_ro"); + else if (opts->nfs) + seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); @@ -865,7 +862,7 @@ enum { Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_err, + Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, }; static const match_table_t fat_tokens = { @@ -895,7 +892,9 @@ static const match_table_t fat_tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, - {Opt_nfs, "nfs"}, + {Opt_nfs_stale_rw, "nfs"}, + {Opt_nfs_stale_rw, "nfs=stale_rw"}, + {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, @@ -1092,6 +1091,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_err_ro: opts->errors = FAT_ERRORS_RO; break; + case Opt_nfs_stale_rw: + opts->nfs = FAT_NFS_STALE_RW; + break; + case Opt_nfs_nostale_ro: + opts->nfs = FAT_NFS_NOSTALE_RO; + break; /* msdos specific */ case Opt_dots: @@ -1150,9 +1155,6 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_discard: opts->discard = 1; break; - case Opt_nfs: - opts->nfs = 1; - break; /* obsolete mount options */ case Opt_obsolete: @@ -1183,6 +1185,10 @@ out: opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); if (opts->unicode_xlate) opts->utf8 = 0; + if (opts->nfs == FAT_NFS_NOSTALE_RO) { + sb->s_flags |= MS_RDONLY; + sb->s_export_op = &fat_export_ops_nostale; + } return 0; } @@ -1193,7 +1199,7 @@ static int fat_read_root(struct inode *inode) struct msdos_sb_info *sbi = MSDOS_SB(sb); int error; - MSDOS_I(inode)->i_pos = 0; + MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode->i_version++; @@ -1223,6 +1229,19 @@ static int fat_read_root(struct inode *inode) return 0; } +static unsigned long calc_fat_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + /* Divide first to avoid overflow */ + if (sbi->fat_bits != 12) { + unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; + return ent_per_sec * sbi->fat_length; + } + + return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; +} + /* * Read the super block of an MS-DOS FS. */ @@ -1256,6 +1275,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; + mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1395,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, brelse(fsinfo_bh); } + /* interpret volume ID as a little endian 32 bit integer */ + if (sbi->fat_bits == 32) + sbi->vol_id = (((u32)b->fat32.vol_id[0]) | + ((u32)b->fat32.vol_id[1] << 8) | + ((u32)b->fat32.vol_id[2] << 16) | + ((u32)b->fat32.vol_id[3] << 24)); + else /* fat 16 or 12 */ + sbi->vol_id = (((u32)b->fat16.vol_id[0]) | + ((u32)b->fat16.vol_id[1] << 8) | + ((u32)b->fat16.vol_id[2] << 16) | + ((u32)b->fat16.vol_id[3] << 24)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; @@ -1427,7 +1459,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ - fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; + fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > MAX_FAT(sb)) { if (!silent) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 359d307b5507..628e22a5a543 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } @@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { sb->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT-fs (%s): Filesystem has been " - "set read-only\n", sb->s_id); + fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 081b759cff83..a783b0e1272a 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len, * that the existing dentry can be used. The msdos fs routines will * return ENOENT or EINVAL as appropriate. */ -static int msdos_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int msdos_hash(const struct dentry *dentry, struct qstr *qstr) { struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; unsigned char msdos_name[MSDOS_NAME]; @@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode, * Compare two msdos names. If either of the names are invalid, * we fall back to doing the standard name comparison. */ -static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 2da952036a3d..6df8d3d885e5 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int vfat_hash(const struct dentry *dentry, struct qstr *qstr) { qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); return 0; @@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode, * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr) { struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; const unsigned char *name; @@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, /* * Case insensitive compare of two vfat names. */ -static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; @@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, /* * Case sensitive compare of two vfat names. */ -static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { unsigned int alen, blen; diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 499c10438ca2..93e14933dcb6 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -14,6 +14,18 @@ #include <linux/exportfs.h> #include "fat.h" +struct fat_fid { + u32 i_gen; + u32 i_pos_low; + u16 i_pos_hi; + u16 parent_i_pos_hi; + u32 parent_i_pos_low; + u32 parent_i_gen; +}; + +#define FAT_FID_SIZE_WITHOUT_PARENT 3 +#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) + /** * Look up a directory inode given its starting cluster. */ @@ -38,63 +50,252 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart) return inode; } -static struct inode *fat_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos) { - struct inode *inode; + if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) + return fat_iget(sb, i_pos); - if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) - return NULL; + else { + if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) + return NULL; + return ilookup(sb, ino); + } +} + +static struct inode *__fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation, loff_t i_pos) +{ + struct inode *inode = fat_ilookup(sb, ino, i_pos); - inode = ilookup(sb, ino); if (inode && generation && (inode->i_generation != generation)) { iput(inode); inode = NULL; } + if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de ; + sector_t blocknr; + int offset; + fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); + if (!bh) { + fat_msg(sb, KERN_ERR, + "unable to read block(%llu) for building NFS inode", + (llu)blocknr); + return inode; + } + de = (struct msdos_dir_entry *)bh->b_data; + /* If a file is deleted on server and client is not updated + * yet, we must not build the inode upon a lookup call. + */ + if (IS_FREE(de[offset].name)) + inode = NULL; + else + inode = fat_build_inode(sb, &de[offset], i_pos); + brelse(bh); + } return inode; } +static struct inode *fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + + return __fat_nfs_get_inode(sb, ino, generation, 0); +} + +static int +fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) +{ + int len = *lenp; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fat_fid *fid = (struct fat_fid *) fh; + loff_t i_pos; + int type = FILEID_FAT_WITHOUT_PARENT; + + if (parent) { + if (len < FAT_FID_SIZE_WITH_PARENT) { + *lenp = FAT_FID_SIZE_WITH_PARENT; + return FILEID_INVALID; + } + } else { + if (len < FAT_FID_SIZE_WITHOUT_PARENT) { + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + return FILEID_INVALID; + } + } + + i_pos = fat_i_pos_read(sbi, inode); + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + fid->i_gen = inode->i_generation; + fid->i_pos_low = i_pos & 0xFFFFFFFF; + fid->i_pos_hi = (i_pos >> 32) & 0xFFFF; + if (parent) { + i_pos = fat_i_pos_read(sbi, parent); + fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF; + fid->parent_i_pos_low = i_pos & 0xFFFFFFFF; + fid->parent_i_gen = parent->i_generation; + type = FILEID_FAT_WITH_PARENT; + *lenp = FAT_FID_SIZE_WITH_PARENT; + } + + return type; +} + /** * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ -struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } +static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + switch (fh_type) { + case FILEID_FAT_WITHOUT_PARENT: + if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT) + return NULL; + break; + case FILEID_FAT_WITH_PARENT: + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + break; + default: + return NULL; + } + i_pos = fid->i_pos_hi; + i_pos = (i_pos << 32) | (fid->i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos); + + return d_obtain_alias(inode); +} + /* * Find the parent for a file specified by NFS handle. * This requires that the handle contain the i_ino of the parent. */ -struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } +static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + + switch (fh_type) { + case FILEID_FAT_WITH_PARENT: + i_pos = fid->parent_i_pos_hi; + i_pos = (i_pos << 32) | (fid->parent_i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos); + break; + } + + return d_obtain_alias(inode); +} + +/* + * Rebuild the parent for a directory that is not connected + * to the filesystem root + */ +static +struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) +{ + int search_clus, clus_to_match; + struct msdos_dir_entry *de; + struct inode *parent = NULL; + struct inode *dummy_grand_parent = NULL; + struct fat_slot_info sinfo; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart); + struct buffer_head *parent_bh = sb_bread(sb, blknr); + if (!parent_bh) { + fat_msg(sb, KERN_ERR, + "unable to read cluster of parent directory"); + return NULL; + } + + de = (struct msdos_dir_entry *) parent_bh->b_data; + clus_to_match = fat_get_start(sbi, &de[0]); + search_clus = fat_get_start(sbi, &de[1]); + + dummy_grand_parent = fat_dget(sb, search_clus); + if (!dummy_grand_parent) { + dummy_grand_parent = new_inode(sb); + if (!dummy_grand_parent) { + brelse(parent_bh); + return parent; + } + + dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO); + fat_fill_inode(dummy_grand_parent, &de[1]); + MSDOS_I(dummy_grand_parent)->i_pos = -1; + } + + if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo)) + parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + + brelse(parent_bh); + iput(dummy_grand_parent); + + return parent; +} + /* * Find the parent for a directory that is not currently connected to * the filesystem root. * * On entry, the caller holds child_dir->d_inode->i_mutex. */ -struct dentry *fat_get_parent(struct dentry *child_dir) +static struct dentry *fat_get_parent(struct dentry *child_dir) { struct super_block *sb = child_dir->d_sb; struct buffer_head *bh = NULL; struct msdos_dir_entry *de; struct inode *parent_inode = NULL; + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { - int parent_logstart = fat_get_start(MSDOS_SB(sb), de); + int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); + if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) + parent_inode = fat_rebuild_parent(sb, parent_logstart); } brelse(bh); return d_obtain_alias(parent_inode); } + +const struct export_operations fat_export_ops = { + .fh_to_dentry = fat_fh_to_dentry, + .fh_to_parent = fat_fh_to_parent, + .get_parent = fat_get_parent, +}; + +const struct export_operations fat_export_ops_nostale = { + .encode_fh = fat_encode_fh_nostale, + .fh_to_dentry = fat_fh_to_dentry_nostale, + .fh_to_parent = fat_fh_to_parent_nostale, + .get_parent = fat_get_parent, +}; diff --git a/fs/fifo.c b/fs/fifo.c deleted file mode 100644 index cf6f4345ceb0..000000000000 --- a/fs/fifo.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * linux/fs/fifo.c - * - * written by Paul H. Hargrove - * - * Fixes: - * 10-06-1999, AV: fixed OOM handling in fifo_open(), moved - * initialization there, switched to external - * allocation of pipe_inode_info. - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/pipe_fs_i.h> - -static int wait_for_partner(struct inode* inode, unsigned int *cnt) -{ - int cur = *cnt; - - while (cur == *cnt) { - pipe_wait(inode->i_pipe); - if (signal_pending(current)) - break; - } - return cur == *cnt ? -ERESTARTSYS : 0; -} - -static void wake_up_partner(struct inode* inode) -{ - wake_up_interruptible(&inode->i_pipe->wait); -} - -static int fifo_open(struct inode *inode, struct file *filp) -{ - struct pipe_inode_info *pipe; - int ret; - - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; - if (!pipe) { - ret = -ENOMEM; - pipe = alloc_pipe_info(inode); - if (!pipe) - goto err_nocleanup; - inode->i_pipe = pipe; - } - filp->f_version = 0; - - /* We can only do regular read/write on fifos */ - filp->f_mode &= (FMODE_READ | FMODE_WRITE); - - switch (filp->f_mode) { - case FMODE_READ: - /* - * O_RDONLY - * POSIX.1 says that O_NONBLOCK means return with the FIFO - * opened, even when there is no process writing the FIFO. - */ - filp->f_op = &read_pipefifo_fops; - pipe->r_counter++; - if (pipe->readers++ == 0) - wake_up_partner(inode); - - if (!pipe->writers) { - if ((filp->f_flags & O_NONBLOCK)) { - /* suppress POLLHUP until we have - * seen a writer */ - filp->f_version = pipe->w_counter; - } else { - if (wait_for_partner(inode, &pipe->w_counter)) - goto err_rd; - } - } - break; - - case FMODE_WRITE: - /* - * O_WRONLY - * POSIX.1 says that O_NONBLOCK means return -1 with - * errno=ENXIO when there is no process reading the FIFO. - */ - ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) - goto err; - - filp->f_op = &write_pipefifo_fops; - pipe->w_counter++; - if (!pipe->writers++) - wake_up_partner(inode); - - if (!pipe->readers) { - if (wait_for_partner(inode, &pipe->r_counter)) - goto err_wr; - } - break; - - case FMODE_READ | FMODE_WRITE: - /* - * O_RDWR - * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. - * This implementation will NEVER block on a O_RDWR open, since - * the process can at least talk to itself. - */ - filp->f_op = &rdwr_pipefifo_fops; - - pipe->readers++; - pipe->writers++; - pipe->r_counter++; - pipe->w_counter++; - if (pipe->readers == 1 || pipe->writers == 1) - wake_up_partner(inode); - break; - - default: - ret = -EINVAL; - goto err; - } - - /* Ok! */ - mutex_unlock(&inode->i_mutex); - return 0; - -err_rd: - if (!--pipe->readers) - wake_up_interruptible(&pipe->wait); - ret = -ERESTARTSYS; - goto err; - -err_wr: - if (!--pipe->writers) - wake_up_interruptible(&pipe->wait); - ret = -ERESTARTSYS; - goto err; - -err: - if (!pipe->readers && !pipe->writers) - free_pipe_info(inode); - -err_nocleanup: - mutex_unlock(&inode->i_mutex); - return ret; -} - -/* - * Dummy default file-operations: the only thing this does - * is contain the open that then fills in the correct operations - * depending on the access mode of the file... - */ -const struct file_operations def_fifo_fops = { - .open = fifo_open, /* will set read_ or write_pipefifo_fops */ - .llseek = noop_llseek, -}; diff --git a/fs/file.c b/fs/file.c index 3906d9577a18..4a78f981557a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -23,24 +23,10 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -struct fdtable_defer { - spinlock_t lock; - struct work_struct wq; - struct fdtable *next; -}; - int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; int sysctl_nr_open_max = 1024 * 1024; /* raised later */ -/* - * We use this list to defer free fdtables that have vmalloced - * sets/arrays. By keeping a per-cpu list, we avoid having to embed - * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in - * this per-task structure. - */ -static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); - static void *alloc_fdmem(size_t size) { /* @@ -67,46 +53,9 @@ static void __free_fdtable(struct fdtable *fdt) kfree(fdt); } -static void free_fdtable_work(struct work_struct *work) -{ - struct fdtable_defer *f = - container_of(work, struct fdtable_defer, wq); - struct fdtable *fdt; - - spin_lock_bh(&f->lock); - fdt = f->next; - f->next = NULL; - spin_unlock_bh(&f->lock); - while(fdt) { - struct fdtable *next = fdt->next; - - __free_fdtable(fdt); - fdt = next; - } -} - static void free_fdtable_rcu(struct rcu_head *rcu) { - struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); - struct fdtable_defer *fddef; - - BUG_ON(!fdt); - BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); - - if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { - kfree(fdt->fd); - kfree(fdt->open_fds); - kfree(fdt); - } else { - fddef = &get_cpu_var(fdtable_defer_list); - spin_lock(&fddef->lock); - fdt->next = fddef->next; - fddef->next = fdt; - /* vmallocs are handled from the workqueue context */ - schedule_work(&fddef->wq); - spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); - } + __free_fdtable(container_of(rcu, struct fdtable, rcu)); } /* @@ -174,7 +123,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; - fdt->next = NULL; return fdt; @@ -221,7 +169,7 @@ static int expand_fdtable(struct files_struct *files, int nr) /* Continue as planned */ copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); } else { /* Somebody else expanded, so undo our attempt */ @@ -316,7 +264,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; - new_fdt->next = NULL; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -490,19 +437,8 @@ void exit_files(struct task_struct *tsk) } } -static void fdtable_defer_list_init(int cpu) -{ - struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); - spin_lock_init(&fddef->lock); - INIT_WORK(&fddef->wq, free_fdtable_work); - fddef->next = NULL; -} - void __init files_defer_init(void) { - int i; - for_each_possible_cpu(i) - fdtable_defer_list_init(i); sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; } diff --git a/fs/file_table.c b/fs/file_table.c index cd4d87a82951..08e719b884ca 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -227,7 +227,7 @@ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_inode; might_sleep(); @@ -306,17 +306,18 @@ void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + unsigned long flags; + file_sb_list_del(file); - if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) { - unsigned long flags; - spin_lock_irqsave(&delayed_fput_lock, flags); - list_add(&file->f_u.fu_list, &delayed_fput_list); - schedule_work(&delayed_fput_work); - spin_unlock_irqrestore(&delayed_fput_lock, flags); - return; + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_u.fu_rcuhead, ____fput); + if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + return; } - init_task_work(&file->f_u.fu_rcuhead, ____fput); - task_work_add(task, &file->f_u.fu_rcuhead, true); + spin_lock_irqsave(&delayed_fput_lock, flags); + list_add(&file->f_u.fu_list, &delayed_fput_list); + schedule_work(&delayed_fput_work); + spin_unlock_irqrestore(&delayed_fput_lock, flags); } } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 664b07a53870..25d4099a4aea 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -49,7 +49,7 @@ static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); -static int vxfs_readdir(struct file *, void *, filldir_t); +static int vxfs_readdir(struct file *, struct dir_context *); const struct inode_operations vxfs_dir_inode_ops = { .lookup = vxfs_lookup, @@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = { const struct file_operations vxfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = vxfs_readdir, + .iterate = vxfs_readdir, }; @@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) * Zero. */ static int -vxfs_readdir(struct file *fp, void *retp, filldir_t filler) +vxfs_readdir(struct file *fp, struct dir_context *ctx) { struct inode *ip = file_inode(fp); struct super_block *sbp = ip->i_sb; @@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) u_long page, npages, block, pblocks, nblocks, offset; loff_t pos; - switch ((long)fp->f_pos) { - case 0: - if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) - goto out; - fp->f_pos++; - /* fallthrough */ - case 1: - if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0) - goto out; - fp->f_pos++; - /* fallthrough */ + if (ctx->pos == 0) { + if (!dir_emit_dot(fp, ctx)) + return 0; + ctx->pos = 1; } - - pos = fp->f_pos - 2; + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR)) + return 0; + ctx->pos = 2; + } + pos = ctx->pos - 2; if (pos > VXFS_DIRROUND(ip->i_size)) return 0; @@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; for (; page < npages; page++, block = 0) { - caddr_t kaddr; + char *kaddr; struct page *pp; pp = vxfs_get_page(ip->i_mapping, page); if (IS_ERR(pp)) continue; - kaddr = (caddr_t)page_address(pp); + kaddr = (char *)page_address(pp); for (; block <= nblocks && block <= pblocks; block++) { - caddr_t baddr, limit; + char *baddr, *limit; struct vxfs_dirblk *dbp; struct vxfs_direct *de; @@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) (kaddr + offset) : (baddr + VXFS_DIRBLKOV(dbp))); - for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { - int over; - + for (; (char *)de <= limit; de = vxfs_next_entry(de)) { if (!de->d_reclen) break; if (!de->d_ino) continue; - offset = (caddr_t)de - kaddr; - over = filler(retp, de->d_name, de->d_namelen, - ((page << PAGE_CACHE_SHIFT) | offset) + 2, - de->d_ino, DT_UNKNOWN); - if (over) { + offset = (char *)de - kaddr; + ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; + if (!dir_emit(ctx, de->d_name, de->d_namelen, + de->d_ino, DT_UNKNOWN)) { vxfs_put_page(pp); - goto done; + return 0; } } offset = 0; @@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) vxfs_put_page(pp); offset = 0; } - -done: - fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; -out: + ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..68851ff2fd41 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -46,6 +45,7 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -88,20 +88,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +95,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +112,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +160,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -463,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data - * I/O completion. + * I/O completion. We don't do it for sync(2) writeback because it has a + * separate, external IO completion path and ->sync_fs for guaranteeing + * inode metadata is written back correctly. */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; @@ -598,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, + .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, @@ -979,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) /* * Retrieve work items and do the writeback they describe */ -long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +static long wb_do_writeback(struct bdi_writeback *wb) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; @@ -987,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) set_bit(BDI_writeback_running, &wb->bdi->state); while ((work = get_next_work_item(bdi)) != NULL) { - /* - * Override sync mode, in case we must wait for completion - * because this thread is exiting now. - */ - if (force_wait) - work->sync_mode = WB_SYNC_ALL; trace_writeback_exec(bdi, work); @@ -1020,66 +998,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; + set_worker_desc("flush-%s", dev_name(bdi->dev)); current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - trace_writeback_thread_start(bdi); - - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. @@ -1399,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb) .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, + .for_sync = 1, }; /* Nothing to do? */ diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index b52aed1dca97..f7cff367db7f 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object( struct fscache_object, cookie_link); cache = object->cache; - if (object->state >= FSCACHE_OBJECT_DYING || + if (fscache_object_is_dying(object) || test_bit(FSCACHE_IOERROR, &cache->flags)) cache = NULL; @@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache, BUG_ON(!ifsdef); cache->flags = 0; - ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); - ifsdef->state = FSCACHE_OBJECT_ACTIVE; + ifsdef->event_mask = + ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) & + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags); if (!tagname) tagname = cache->identifier; @@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, { struct fscache_object *object; - spin_lock(&cache->object_list_lock); - while (!list_empty(&cache->object_list)) { - object = list_entry(cache->object_list.next, - struct fscache_object, cache_link); - list_move_tail(&object->cache_link, dying_objects); + spin_lock(&cache->object_list_lock); - _debug("withdraw %p", object->cookie); + if (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); - spin_lock(&object->lock); - spin_unlock(&cache->object_list_lock); - fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); - spin_unlock(&object->lock); + _debug("withdraw %p", object->cookie); + + /* This must be done under object_list_lock to prevent + * a race with fscache_drop_object(). + */ + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + spin_unlock(&cache->object_list_lock); cond_resched(); - spin_lock(&cache->object_list_lock); } - - spin_unlock(&cache->object_list_lock); } /** diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index e2cba1f60c21..0e91a3c9fdb2 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie( atomic_set(&cookie->usage, 1); atomic_set(&cookie->n_children, 0); + /* We keep the active count elevated until relinquishment to prevent an + * attempt to wake up every time the object operations queue quiesces. + */ + atomic_set(&cookie->n_active, 1); + atomic_inc(&parent->usage); atomic_inc(&parent->n_children); @@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) cookie->flags = (1 << FSCACHE_COOKIE_LOOKING_UP) | - (1 << FSCACHE_COOKIE_CREATING) | (1 << FSCACHE_COOKIE_NO_DATA_YET); /* ask the cache to allocate objects for this cookie and its parent @@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) /* initiate the process of looking up all the objects in the chain * (done by fscache_initialise_object()) */ - fscache_enqueue_object(object); + fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); spin_unlock(&cookie->lock); @@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (object->state >= FSCACHE_OBJECT_DYING) { + if (fscache_object_is_dead(object)) { spin_unlock(&cookie->lock); goto error; } @@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, ret = -EEXIST; hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { if (p->cache == object->cache) { - if (p->state >= FSCACHE_OBJECT_DYING) + if (fscache_object_is_dying(p)) ret = -ENOBUFS; goto cant_attach_object; } @@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, hlist_for_each_entry(p, &cookie->parent->backing_objects, cookie_link) { if (p->cache == object->cache) { - if (p->state >= FSCACHE_OBJECT_DYING) { + if (fscache_object_is_dying(p)) { ret = -ENOBUFS; spin_unlock(&cookie->parent->lock); goto cant_attach_object; @@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie) object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - if (object->state < FSCACHE_OBJECT_DYING) + if (fscache_object_is_live(object)) fscache_raise_event( object, FSCACHE_OBJECT_EV_INVALIDATE); } @@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie); */ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) { - struct fscache_cache *cache; struct fscache_object *object; - unsigned long event; fscache_stat(&fscache_n_relinquishes); if (retire) @@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) return; } - _enter("%p{%s,%p},%d", - cookie, cookie->def->name, cookie->netfs_data, retire); + _enter("%p{%s,%p,%d},%d", + cookie, cookie->def->name, cookie->netfs_data, + atomic_read(&cookie->n_active), retire); + + ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", @@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) BUG(); } - /* wait for the cookie to finish being instantiated (or to fail) */ - if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { - fscache_stat(&fscache_n_relinquishes_waitcrt); - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - - event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + /* No further netfs-accessing operations on this cookie permitted */ + set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + if (retire) + set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); -try_again: spin_lock(&cookie->lock); - - /* break links with all the active objects */ - while (!hlist_empty(&cookie->backing_objects)) { - int n_reads; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, - cookie_link); - - _debug("RELEASE OBJ%x", object->debug_id); - - set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags); - n_reads = atomic_read(&object->n_reads); - if (n_reads) { - int n_ops = object->n_ops; - int n_in_progress = object->n_in_progress; - spin_unlock(&cookie->lock); - printk(KERN_ERR "FS-Cache:" - " Cookie '%s' still has %d outstanding reads (%d,%d)\n", - cookie->def->name, - n_reads, n_ops, n_in_progress); - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - printk("Wait finished\n"); - goto try_again; - } - - /* detach each cache object from the object cookie */ - spin_lock(&object->lock); - hlist_del_init(&object->cookie_link); - - cache = object->cache; - object->cookie = NULL; - fscache_raise_event(object, event); - spin_unlock(&object->lock); - - if (atomic_dec_and_test(&cookie->usage)) - /* the cookie refcount shouldn't be reduced to 0 yet */ - BUG(); + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } + spin_unlock(&cookie->lock); - /* detach pointers back to the netfs */ + /* Wait for cessation of activity requiring access to the netfs (when + * n_active reaches 0). + */ + if (!atomic_dec_and_test(&cookie->n_active)) + wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + TASK_UNINTERRUPTIBLE); + + /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - - spin_unlock(&cookie->lock); + BUG_ON(cookie->stores.rnode); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); @@ -553,7 +524,7 @@ try_again: atomic_dec(&cookie->parent->n_children); } - /* finally dispose of the cookie */ + /* Dispose of the netfs's link to the cookie */ ASSERTCMP(atomic_read(&cookie->usage), >, 0); fscache_cookie_put(cookie); diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index f5b4baee7352..10a2ade0bdf8 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = { struct fscache_cookie fscache_fsdef_index = { .usage = ATOMIC_INIT(1), + .n_active = ATOMIC_INIT(1), .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ee38fef4be51..12d505bedb5c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void) extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); +extern int fscache_wait_atomic_t(atomic_t *); /* * object.c */ -extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; - -extern void fscache_withdrawing_object(struct fscache_cache *, - struct fscache_object *); extern void fscache_enqueue_object(struct fscache_object *); /* @@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *); extern const struct file_operations fscache_objlist_fops; extern void fscache_objlist_add(struct fscache_object *); +extern void fscache_objlist_remove(struct fscache_object *); #else #define fscache_objlist_add(object) do {} while(0) +#define fscache_objlist_remove(object) do {} while(0) #endif /* @@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object, unsigned event) { BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); +#if 0 + printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n", + object->debug_id, object->event_mask, (1 << event)); +#endif if (!test_and_set_bit(event, &object->events) && test_bit(event, &object->event_mask)) fscache_enqueue_object(object); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index f9d856773f79..7c27907e650c 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags) schedule(); return 0; } -EXPORT_SYMBOL(fscache_wait_bit); /* * wait_on_bit() sleep function for interruptible waiting @@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags) schedule(); return signal_pending(current); } -EXPORT_SYMBOL(fscache_wait_bit_interruptible); + +/* + * wait_on_atomic_t() sleep function for uninterruptible waiting + */ +int fscache_wait_atomic_t(atomic_t *p) +{ + schedule(); + return 0; +} diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index e028b8eb1c40..b1bb6117473a 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) /* initialise the primary index cookie */ atomic_set(&netfs->primary_index->usage, 1); atomic_set(&netfs->primary_index->n_children, 0); + atomic_set(&netfs->primary_index->n_active, 1); netfs->primary_index->def = &fscache_fsdef_netfs_def; netfs->primary_index->parent = &fscache_fsdef_index; diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index f27c89d17885..e1959efad64f 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj) write_unlock(&fscache_object_list_lock); } -/** - * fscache_object_destroy - Note that a cache object is about to be destroyed - * @object: The object to be destroyed - * - * Note the imminent destruction and deallocation of a cache object record. +/* + * Remove an object from the object list. */ -void fscache_object_destroy(struct fscache_object *obj) +void fscache_objlist_remove(struct fscache_object *obj) { write_lock(&fscache_object_list_lock); @@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj) write_unlock(&fscache_object_list_lock); } -EXPORT_SYMBOL(fscache_object_destroy); /* * find the object in the tree on or after the specified index @@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v) { struct fscache_objlist_data *data = m->private; struct fscache_object *obj = v; + struct fscache_cookie *cookie; unsigned long config = data->config; - uint16_t keylen, auxlen; char _type[3], *type; - bool no_cookie; u8 *buf = data->buf, *p; if ((unsigned long) v == 1) { seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" - " EM EV F S" + " EM EV FL S" " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); if (config & (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) @@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) if ((unsigned long) v == 2) { seq_puts(m, "======== ======== ==== ===== === === === == =====" - " == == = =" + " == == == =" " | ================ == == ================"); if (config & (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) @@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) } \ } while(0) + cookie = obj->cookie; if (~config) { - FILTER(obj->cookie, + FILTER(cookie->def, COOKIE, NOCOOKIE); - FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || + FILTER(fscache_object_is_active(obj) || obj->n_ops != 0 || obj->n_obj_ops != 0 || obj->flags || @@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v) } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ", obj->debug_id, obj->parent ? obj->parent->debug_id : -1, - fscache_object_states_short[obj->state], + obj->state->short_name, obj->n_children, obj->n_ops, obj->n_obj_ops, @@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->flags, work_busy(&obj->work)); - no_cookie = true; - keylen = auxlen = 0; - if (obj->cookie) { - spin_lock(&obj->lock); - if (obj->cookie) { - switch (obj->cookie->def->type) { - case 0: - type = "IX"; - break; - case 1: - type = "DT"; - break; - default: - sprintf(_type, "%02u", - obj->cookie->def->type); - type = _type; - break; - } + if (fscache_use_cookie(obj)) { + uint16_t keylen = 0, auxlen = 0; - seq_printf(m, "%-16s %s %2lx %16p", - obj->cookie->def->name, - type, - obj->cookie->flags, - obj->cookie->netfs_data); - - if (obj->cookie->def->get_key && - config & FSCACHE_OBJLIST_CONFIG_KEY) - keylen = obj->cookie->def->get_key( - obj->cookie->netfs_data, - buf, 400); - - if (obj->cookie->def->get_aux && - config & FSCACHE_OBJLIST_CONFIG_AUX) - auxlen = obj->cookie->def->get_aux( - obj->cookie->netfs_data, - buf + keylen, 512 - keylen); - - no_cookie = false; + switch (cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", cookie->def->type); + type = _type; + break; } - spin_unlock(&obj->lock); - if (!no_cookie && (keylen > 0 || auxlen > 0)) { + seq_printf(m, "%-16s %s %2lx %16p", + cookie->def->name, + type, + cookie->flags, + cookie->netfs_data); + + if (cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = cookie->def->get_key(cookie->netfs_data, + buf, 400); + + if (cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = cookie->def->get_aux(cookie->netfs_data, + buf + keylen, 512 - keylen); + fscache_unuse_cookie(obj); + + if (keylen > 0 || auxlen > 0) { seq_printf(m, " "); for (p = buf; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); @@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) seq_printf(m, "%02x", *p++); } } - } - if (no_cookie) - seq_printf(m, "<no_cookie>\n"); - else seq_printf(m, "\n"); + } else { + seq_printf(m, "<no_netfs>\n"); + } return 0; } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 50d41c180211..86d75a60b20c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -15,52 +15,131 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> #include <linux/slab.h> +#include <linux/prefetch.h> #include "internal.h" -const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { - [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", - [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", - [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", - [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", - [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", - [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING", - [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", - [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", - [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", - [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", - [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", - [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", - [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", - [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", +static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int); +static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int); +static const struct fscache_state *fscache_drop_object(struct fscache_object *, int); +static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int); +static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int); +static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int); +static const struct fscache_state *fscache_kill_object(struct fscache_object *, int); +static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int); +static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_available(struct fscache_object *, int); +static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); +static const struct fscache_state *fscache_update_object(struct fscache_object *, int); + +#define __STATE_NAME(n) fscache_osm_##n +#define STATE(n) (&__STATE_NAME(n)) + +/* + * Define a work state. Work states are execution states. No event processing + * is performed by them. The function attached to a work state returns a + * pointer indicating the next state to which the state machine should + * transition. Returning NO_TRANSIT repeats the current state, but goes back + * to the scheduler first. + */ +#define WORK_STATE(n, sn, f) \ + const struct fscache_state __STATE_NAME(n) = { \ + .name = #n, \ + .short_name = sn, \ + .work = f \ + } + +/* + * Returns from work states. + */ +#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); }) + +#define NO_TRANSIT ((struct fscache_state *)NULL) + +/* + * Define a wait state. Wait states are event processing states. No execution + * is performed by them. Wait states are just tables of "if event X occurs, + * clear it and transition to state Y". The dispatcher returns to the + * scheduler if none of the events in which the wait state has an interest are + * currently pending. + */ +#define WAIT_STATE(n, sn, ...) \ + const struct fscache_state __STATE_NAME(n) = { \ + .name = #n, \ + .short_name = sn, \ + .work = NULL, \ + .transitions = { __VA_ARGS__, { 0, NULL } } \ + } + +#define TRANSIT_TO(state, emask) \ + { .events = (emask), .transit_to = STATE(state) } + +/* + * The object state machine. + */ +static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); +static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); +static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); +static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); +static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); +static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); +static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); + +static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object); +static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object); + +static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); +static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); +static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); +static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); +static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); + +static WAIT_STATE(WAIT_FOR_INIT, "?INI", + TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); + +static WAIT_STATE(WAIT_FOR_PARENT, "?PRN", + TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY)); + +static WAIT_STATE(WAIT_FOR_CMD, "?CMD", + TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE), + TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE), + TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); + +static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR", + TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED)); + +/* + * Out-of-band event transition tables. These are for handling unexpected + * events, such as an I/O error. If an OOB event occurs, the state machine + * clears and disables the event and forces a transition to the nominated work + * state (acurrently executing work states will complete first). + * + * In such a situation, object->state remembers the state the machine should + * have been in/gone to and returning NO_TRANSIT returns to that. + */ +static const struct fscache_transition fscache_osm_init_oob[] = { + TRANSIT_TO(ABORT_INIT, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } +}; + +static const struct fscache_transition fscache_osm_lookup_oob[] = { + TRANSIT_TO(LOOKUP_FAILURE, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } }; -EXPORT_SYMBOL(fscache_object_states); - -const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { - [FSCACHE_OBJECT_INIT] = "INIT", - [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", - [FSCACHE_OBJECT_CREATING] = "CRTN", - [FSCACHE_OBJECT_AVAILABLE] = "AVBL", - [FSCACHE_OBJECT_ACTIVE] = "ACTV", - [FSCACHE_OBJECT_INVALIDATING] = "INVL", - [FSCACHE_OBJECT_UPDATING] = "UPDT", - [FSCACHE_OBJECT_DYING] = "DYNG", - [FSCACHE_OBJECT_LC_DYING] = "LCDY", - [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", - [FSCACHE_OBJECT_RELEASING] = "RELS", - [FSCACHE_OBJECT_RECYCLING] = "RCYC", - [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", - [FSCACHE_OBJECT_DEAD] = "DEAD", + +static const struct fscache_transition fscache_osm_run_oob[] = { + TRANSIT_TO(KILL_OBJECT, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } }; static int fscache_get_object(struct fscache_object *); static void fscache_put_object(struct fscache_object *); -static void fscache_initialise_object(struct fscache_object *); -static void fscache_lookup_object(struct fscache_object *); -static void fscache_object_available(struct fscache_object *); -static void fscache_invalidate_object(struct fscache_object *); -static void fscache_release_object(struct fscache_object *); -static void fscache_withdraw_object(struct fscache_object *); -static void fscache_enqueue_dependents(struct fscache_object *); +static bool fscache_enqueue_dependents(struct fscache_object *, int); static void fscache_dequeue_object(struct fscache_object *); /* @@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object) object->debug_id, parent->debug_id, parent->n_ops); spin_lock_nested(&parent->lock, 1); - parent->n_ops--; parent->n_obj_ops--; + parent->n_ops--; if (parent->n_ops == 0) fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); spin_unlock(&parent->lock); } /* - * Notify netfs of invalidation completion. + * Object state machine dispatcher. */ -static inline void fscache_invalidation_complete(struct fscache_cookie *cookie) +static void fscache_object_sm_dispatcher(struct fscache_object *object) { - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); -} - -/* - * process events that have been sent to an object's state machine - * - initiates parent lookup - * - does object lookup - * - does object creation - * - does object recycling and retirement - * - does object withdrawal - */ -static void fscache_object_state_machine(struct fscache_object *object) -{ - enum fscache_object_state new_state; - struct fscache_cookie *cookie; - int event; + const struct fscache_transition *t; + const struct fscache_state *state, *new_state; + unsigned long events, event_mask; + int event = -1; ASSERT(object != NULL); _enter("{OBJ%x,%s,%lx}", - object->debug_id, fscache_object_states[object->state], - object->events); - - switch (object->state) { - /* wait for the parent object to become ready */ - case FSCACHE_OBJECT_INIT: - object->event_mask = - FSCACHE_OBJECT_EVENTS_MASK & - ~(1 << FSCACHE_OBJECT_EV_CLEARED); - fscache_initialise_object(object); - goto done; - - /* look up the object metadata on disk */ - case FSCACHE_OBJECT_LOOKING_UP: - fscache_lookup_object(object); - goto lookup_transit; - - /* create the object metadata on disk */ - case FSCACHE_OBJECT_CREATING: - fscache_lookup_object(object); - goto lookup_transit; - - /* handle an object becoming available; start pending - * operations and queue dependent operations for processing */ - case FSCACHE_OBJECT_AVAILABLE: - fscache_object_available(object); - goto active_transit; - - /* normal running state */ - case FSCACHE_OBJECT_ACTIVE: - goto active_transit; - - /* Invalidate an object on disk */ - case FSCACHE_OBJECT_INVALIDATING: - clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events); - fscache_stat(&fscache_n_invalidates_run); - fscache_stat(&fscache_n_cop_invalidate_object); - fscache_invalidate_object(object); - fscache_stat_d(&fscache_n_cop_invalidate_object); - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); - goto active_transit; - - /* update the object metadata on disk */ - case FSCACHE_OBJECT_UPDATING: - clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); - fscache_stat(&fscache_n_updates_run); - fscache_stat(&fscache_n_cop_update_object); - object->cache->ops->update_object(object); - fscache_stat_d(&fscache_n_cop_update_object); - goto active_transit; - - /* handle an object dying during lookup or creation */ - case FSCACHE_OBJECT_LC_DYING: - object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); - fscache_stat(&fscache_n_cop_lookup_complete); - object->cache->ops->lookup_complete(object); - fscache_stat_d(&fscache_n_cop_lookup_complete); - - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_DYING; - cookie = object->cookie; - if (cookie) { - if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, - &cookie->flags)) - wake_up_bit(&cookie->flags, - FSCACHE_COOKIE_LOOKING_UP); - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, - &cookie->flags)) - wake_up_bit(&cookie->flags, - FSCACHE_COOKIE_CREATING); + object->debug_id, object->state->name, object->events); + + event_mask = object->event_mask; +restart: + object->event_mask = 0; /* Mask normal event handling */ + state = object->state; +restart_masked: + events = object->events; + + /* Handle any out-of-band events (typically an error) */ + if (events & object->oob_event_mask) { + _debug("{OBJ%x} oob %lx", + object->debug_id, events & object->oob_event_mask); + for (t = object->oob_table; t->events; t++) { + if (events & t->events) { + state = t->transit_to; + ASSERT(state->work != NULL); + event = fls(events & t->events) - 1; + __clear_bit(event, &object->oob_event_mask); + clear_bit(event, &object->events); + goto execute_work_state; + } } - spin_unlock(&object->lock); + } - fscache_done_parent_op(object); + /* Wait states are just transition tables */ + if (!state->work) { + if (events & event_mask) { + for (t = state->transitions; t->events; t++) { + if (events & t->events) { + new_state = t->transit_to; + event = fls(events & t->events) - 1; + clear_bit(event, &object->events); + _debug("{OBJ%x} ev %d: %s -> %s", + object->debug_id, event, + state->name, new_state->name); + object->state = state = new_state; + goto execute_work_state; + } + } - /* wait for completion of all active operations on this object - * and the death of all child objects of this object */ - case FSCACHE_OBJECT_DYING: - dying: - clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); - spin_lock(&object->lock); - _debug("dying OBJ%x {%d,%d}", - object->debug_id, object->n_ops, object->n_children); - if (object->n_ops == 0 && object->n_children == 0) { - object->event_mask &= - ~(1 << FSCACHE_OBJECT_EV_CLEARED); - object->event_mask |= - (1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR); - } else { - object->event_mask &= - ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR)); - object->event_mask |= - 1 << FSCACHE_OBJECT_EV_CLEARED; + /* The event mask didn't include all the tabled bits */ + BUG(); } - spin_unlock(&object->lock); - fscache_enqueue_dependents(object); - fscache_start_operations(object); - goto terminal_transit; - - /* handle an abort during initialisation */ - case FSCACHE_OBJECT_ABORT_INIT: - _debug("handle abort init %lx", object->events); - object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); - - spin_lock(&object->lock); - fscache_dequeue_object(object); - - object->state = FSCACHE_OBJECT_DYING; - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, - &object->cookie->flags)) - wake_up_bit(&object->cookie->flags, - FSCACHE_COOKIE_CREATING); - spin_unlock(&object->lock); - goto dying; - - /* handle the netfs releasing an object and possibly marking it - * obsolete too */ - case FSCACHE_OBJECT_RELEASING: - case FSCACHE_OBJECT_RECYCLING: - object->event_mask &= - ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR)); - fscache_release_object(object); - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_DEAD; - spin_unlock(&object->lock); - fscache_stat(&fscache_n_object_dead); - goto terminal_transit; - - /* handle the parent cache of this object being withdrawn from - * active service */ - case FSCACHE_OBJECT_WITHDRAWING: - object->event_mask &= - ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR)); - fscache_withdraw_object(object); - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_DEAD; - spin_unlock(&object->lock); - fscache_stat(&fscache_n_object_dead); - goto terminal_transit; - - /* complain about the object being woken up once it is - * deceased */ - case FSCACHE_OBJECT_DEAD: - printk(KERN_ERR "FS-Cache:" - " Unexpected event in dead state %lx\n", - object->events & object->event_mask); - BUG(); - - default: - printk(KERN_ERR "FS-Cache: Unknown object state %u\n", - object->state); - BUG(); - } - - /* determine the transition from a lookup state */ -lookup_transit: - event = fls(object->events & object->event_mask) - 1; - switch (event) { - case FSCACHE_OBJECT_EV_WITHDRAW: - case FSCACHE_OBJECT_EV_RETIRE: - case FSCACHE_OBJECT_EV_RELEASE: - case FSCACHE_OBJECT_EV_ERROR: - new_state = FSCACHE_OBJECT_LC_DYING; - goto change_state; - case FSCACHE_OBJECT_EV_INVALIDATE: - new_state = FSCACHE_OBJECT_INVALIDATING; - goto change_state; - case FSCACHE_OBJECT_EV_REQUEUE: - goto done; - case -1: - goto done; /* sleep until event */ - default: - goto unsupported_event; + /* Randomly woke up */ + goto unmask_events; } - /* determine the transition from an active state */ -active_transit: - event = fls(object->events & object->event_mask) - 1; - switch (event) { - case FSCACHE_OBJECT_EV_WITHDRAW: - case FSCACHE_OBJECT_EV_RETIRE: - case FSCACHE_OBJECT_EV_RELEASE: - case FSCACHE_OBJECT_EV_ERROR: - new_state = FSCACHE_OBJECT_DYING; - goto change_state; - case FSCACHE_OBJECT_EV_INVALIDATE: - new_state = FSCACHE_OBJECT_INVALIDATING; - goto change_state; - case FSCACHE_OBJECT_EV_UPDATE: - new_state = FSCACHE_OBJECT_UPDATING; - goto change_state; - case -1: - new_state = FSCACHE_OBJECT_ACTIVE; - goto change_state; /* sleep until event */ - default: - goto unsupported_event; - } +execute_work_state: + _debug("{OBJ%x} exec %s", object->debug_id, state->name); - /* determine the transition from a terminal state */ -terminal_transit: - event = fls(object->events & object->event_mask) - 1; - switch (event) { - case FSCACHE_OBJECT_EV_WITHDRAW: - new_state = FSCACHE_OBJECT_WITHDRAWING; - goto change_state; - case FSCACHE_OBJECT_EV_RETIRE: - new_state = FSCACHE_OBJECT_RECYCLING; - goto change_state; - case FSCACHE_OBJECT_EV_RELEASE: - new_state = FSCACHE_OBJECT_RELEASING; - goto change_state; - case FSCACHE_OBJECT_EV_ERROR: - new_state = FSCACHE_OBJECT_WITHDRAWING; - goto change_state; - case FSCACHE_OBJECT_EV_CLEARED: - new_state = FSCACHE_OBJECT_DYING; - goto change_state; - case -1: - goto done; /* sleep until event */ - default: - goto unsupported_event; + new_state = state->work(object, event); + event = -1; + if (new_state == NO_TRANSIT) { + _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + fscache_enqueue_object(object); + event_mask = object->oob_event_mask; + goto unmask_events; } -change_state: - spin_lock(&object->lock); - object->state = new_state; - spin_unlock(&object->lock); + _debug("{OBJ%x} %s -> %s", + object->debug_id, state->name, new_state->name); + object->state = state = new_state; -done: - _leave(" [->%s]", fscache_object_states[object->state]); - return; + if (state->work) { + if (unlikely(state->work == ((void *)2UL))) { + _leave(" [dead]"); + return; + } + goto restart_masked; + } -unsupported_event: - printk(KERN_ERR "FS-Cache:" - " Unsupported event %d [%lx/%lx] in state %s\n", - event, object->events, object->event_mask, - fscache_object_states[object->state]); - BUG(); + /* Transited to wait state */ + event_mask = object->oob_event_mask; + for (t = state->transitions; t->events; t++) + event_mask |= t->events; + +unmask_events: + object->event_mask = event_mask; + smp_mb(); + events = object->events; + if (events & event_mask) + goto restart; + _leave(" [msk %lx]", event_mask); } /* * execute an object */ -void fscache_object_work_func(struct work_struct *work) +static void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work) _enter("{OBJ%x}", object->debug_id); start = jiffies; - fscache_object_state_machine(object); + fscache_object_sm_dispatcher(object); fscache_hist(fscache_objs_histogram, start); - if (object->events & object->event_mask) - fscache_enqueue_object(object); - clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); fscache_put_object(object); } -EXPORT_SYMBOL(fscache_object_work_func); + +/** + * fscache_object_init - Initialise a cache object description + * @object: Object description + * @cookie: Cookie object will be attached to + * @cache: Cache in which backing object will be found + * + * Initialise a cache object description to its basic values. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_object_init(struct fscache_object *object, + struct fscache_cookie *cookie, + struct fscache_cache *cache) +{ + const struct fscache_transition *t; + + atomic_inc(&cache->object_count); + + object->state = STATE(WAIT_FOR_INIT); + object->oob_table = fscache_osm_init_oob; + object->flags = 1 << FSCACHE_OBJECT_IS_LIVE; + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + INIT_HLIST_NODE(&object->cookie_link); + INIT_WORK(&object->work, fscache_object_work_func); + INIT_LIST_HEAD(&object->dependents); + INIT_LIST_HEAD(&object->dep_link); + INIT_LIST_HEAD(&object->pending_ops); + object->n_children = 0; + object->n_ops = object->n_in_progress = object->n_exclusive = 0; + object->events = 0; + object->store_limit = 0; + object->store_limit_l = 0; + object->cache = cache; + object->cookie = cookie; + object->parent = NULL; + + object->oob_event_mask = 0; + for (t = object->oob_table; t->events; t++) + object->oob_event_mask |= t->events; + object->event_mask = object->oob_event_mask; + for (t = object->state->transitions; t->events; t++) + object->event_mask |= t->events; +} +EXPORT_SYMBOL(fscache_object_init); + +/* + * Abort object initialisation before we start it. + */ +static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_event_mask = 0; + fscache_dequeue_object(object); + return transit_to(KILL_OBJECT); +} /* * initialise an object @@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func); * immediately to do a creation * - we may need to start the process of creating a parent and we need to wait * for the parent's lookup and creation to complete if it's not there yet - * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the - * leaf-most cookies of the object and all its children */ -static void fscache_initialise_object(struct fscache_object *object) +static const struct fscache_state *fscache_initialise_object(struct fscache_object *object, + int event) { struct fscache_object *parent; + bool success; - _enter(""); - ASSERT(object->cookie != NULL); - ASSERT(object->cookie->parent != NULL); - - if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { - _debug("abort init %lx", object->events); - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_ABORT_INIT; - spin_unlock(&object->lock); - return; - } + _enter("{OBJ%x},%d", object->debug_id, event); - spin_lock(&object->cookie->lock); - spin_lock_nested(&object->cookie->parent->lock, 1); + ASSERT(list_empty(&object->dep_link)); parent = object->parent; if (!parent) { - _debug("no parent"); - set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); - } else { - spin_lock(&object->lock); - spin_lock_nested(&parent->lock, 1); - _debug("parent %s", fscache_object_states[parent->state]); - - if (parent->state >= FSCACHE_OBJECT_DYING) { - _debug("bad parent"); - set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); - } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { - _debug("wait"); - - /* we may get woken up in this state by child objects - * binding on to us, so we need to make sure we don't - * add ourself to the list multiple times */ - if (list_empty(&object->dep_link)) { - fscache_stat(&fscache_n_cop_grab_object); - object->cache->ops->grab_object(object); - fscache_stat_d(&fscache_n_cop_grab_object); - list_add(&object->dep_link, - &parent->dependents); - - /* fscache_acquire_non_index_cookie() uses this - * to wake the chain up */ - if (parent->state == FSCACHE_OBJECT_INIT) - fscache_enqueue_object(parent); - } - } else { - _debug("go"); - parent->n_ops++; - parent->n_obj_ops++; - object->lookup_jif = jiffies; - object->state = FSCACHE_OBJECT_LOOKING_UP; - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - } + _leave(" [no parent]"); + return transit_to(DROP_OBJECT); + } - spin_unlock(&parent->lock); - spin_unlock(&object->lock); + _debug("parent: %s of:%lx", parent->state->name, parent->flags); + + if (fscache_object_is_dying(parent)) { + _leave(" [bad parent]"); + return transit_to(DROP_OBJECT); } - spin_unlock(&object->cookie->parent->lock); - spin_unlock(&object->cookie->lock); + if (fscache_object_is_available(parent)) { + _leave(" [ready]"); + return transit_to(PARENT_READY); + } + + _debug("wait"); + + spin_lock(&parent->lock); + fscache_stat(&fscache_n_cop_grab_object); + success = false; + if (fscache_object_is_live(parent) && + object->cache->ops->grab_object(object)) { + list_add(&object->dep_link, &parent->dependents); + success = true; + } + fscache_stat_d(&fscache_n_cop_grab_object); + spin_unlock(&parent->lock); + if (!success) { + _leave(" [grab failed]"); + return transit_to(DROP_OBJECT); + } + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD); + _leave(" [wait]"); + return transit_to(WAIT_FOR_PARENT); +} + +/* + * Once the parent object is ready, we should kick off our lookup op. + */ +static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, + int event) +{ + struct fscache_object *parent = object->parent; + + _enter("{OBJ%x},%d", object->debug_id, event); + + ASSERT(parent != NULL); + + spin_lock(&parent->lock); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + spin_unlock(&parent->lock); + _leave(""); + return transit_to(LOOK_UP_OBJECT); } /* * look an object up in the cache from which it was allocated * - we hold an "access lock" on the parent object, so the parent object cannot * be withdrawn by either party till we've finished - * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the - * leaf-most cookies of the object and all its children */ -static void fscache_lookup_object(struct fscache_object *object) +static const struct fscache_state *fscache_look_up_object(struct fscache_object *object, + int event) { struct fscache_cookie *cookie = object->cookie; - struct fscache_object *parent; + struct fscache_object *parent = object->parent; int ret; - _enter(""); + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_table = fscache_osm_lookup_oob; - parent = object->parent; ASSERT(parent != NULL); ASSERTCMP(parent->n_ops, >, 0); ASSERTCMP(parent->n_obj_ops, >, 0); /* make sure the parent is still available */ - ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); - - if (parent->state >= FSCACHE_OBJECT_DYING || - test_bit(FSCACHE_IOERROR, &object->cache->flags)) { - _debug("unavailable"); - set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); - _leave(""); - return; + ASSERT(fscache_object_is_available(parent)); + + if (fscache_object_is_dying(parent) || + test_bit(FSCACHE_IOERROR, &object->cache->flags) || + !fscache_use_cookie(object)) { + _leave(" [unavailable]"); + return transit_to(LOOKUP_FAILURE); } - _debug("LOOKUP \"%s/%s\" in \"%s\"", - parent->cookie->def->name, cookie->def->name, - object->cache->tag->name); + _debug("LOOKUP \"%s\" in \"%s\"", + cookie->def->name, object->cache->tag->name); fscache_stat(&fscache_n_object_lookups); fscache_stat(&fscache_n_cop_lookup_object); ret = object->cache->ops->lookup_object(object); fscache_stat_d(&fscache_n_cop_lookup_object); - if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) - set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + fscache_unuse_cookie(object); if (ret == -ETIMEDOUT) { /* probably stuck behind another object, so move this one to * the back of the queue */ fscache_stat(&fscache_n_object_lookups_timed_out); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + _leave(" [timeout]"); + return NO_TRANSIT; } - _leave(""); + if (ret < 0) { + _leave(" [error]"); + return transit_to(LOOKUP_FAILURE); + } + + _leave(" [ok]"); + return transit_to(OBJECT_AVAILABLE); } /** @@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; - _enter("{OBJ%x,%s}", - object->debug_id, fscache_object_states[object->state]); + _enter("{OBJ%x,%s}", object->debug_id, object->state->name); - spin_lock(&object->lock); - if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { fscache_stat(&fscache_n_object_lookups_negative); - /* transit here to allow write requests to begin stacking up - * and read requests to begin returning ENODATA */ - object->state = FSCACHE_OBJECT_CREATING; - spin_unlock(&object->lock); - - set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); + /* Allow write requests to begin stacking up and read requests to begin + * returning ENODATA. + */ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); _debug("wake up lookup %p", &cookie->flags); - smp_mb__before_clear_bit(); - clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - } else { - ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); - spin_unlock(&object->lock); } - _leave(""); } EXPORT_SYMBOL(fscache_object_lookup_negative); @@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; - _enter("{OBJ%x,%s}", - object->debug_id, fscache_object_states[object->state]); + _enter("{OBJ%x,%s}", object->debug_id, object->state->name); /* if we were still looking up, then we must have a positive lookup * result, in which case there may be data available */ - spin_lock(&object->lock); - if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { fscache_stat(&fscache_n_object_lookups_positive); - clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + /* We do (presumably) have data */ + clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - object->state = FSCACHE_OBJECT_AVAILABLE; - spin_unlock(&object->lock); - - smp_mb__before_clear_bit(); - clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - smp_mb__after_clear_bit(); + /* Allow write requests to begin stacking up and read requests + * to begin shovelling data. + */ + clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); } else { - ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); fscache_stat(&fscache_n_object_created); - - object->state = FSCACHE_OBJECT_AVAILABLE; - spin_unlock(&object->lock); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - smp_wmb(); } - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); - + set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); _leave(""); } EXPORT_SYMBOL(fscache_obtained_object); @@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object); /* * handle an object that has just become available */ -static void fscache_object_available(struct fscache_object *object) +static const struct fscache_state *fscache_object_available(struct fscache_object *object, + int event) { - _enter("{OBJ%x}", object->debug_id); + _enter("{OBJ%x},%d", object->debug_id, event); - spin_lock(&object->lock); + object->oob_table = fscache_osm_run_oob; - if (object->cookie && - test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) - wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); + spin_lock(&object->lock); fscache_done_parent_op(object); if (object->n_in_progress == 0) { @@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object) fscache_stat(&fscache_n_cop_lookup_complete); object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); - fscache_enqueue_dependents(object); fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); fscache_stat(&fscache_n_object_avail); _leave(""); + return transit_to(JUMPSTART_DEPS); } /* - * drop an object's attachments + * Wake up this object's dependent objects now that we've become available. */ -static void fscache_drop_object(struct fscache_object *object) +static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object, + int event) { - struct fscache_object *parent = object->parent; - struct fscache_cache *cache = object->cache; + _enter("{OBJ%x},%d", object->debug_id, event); - _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY)) + return NO_TRANSIT; /* Not finished; requeue */ + return transit_to(WAIT_FOR_CMD); +} - ASSERTCMP(object->cookie, ==, NULL); - ASSERT(hlist_unhashed(&object->cookie_link)); +/* + * Handle lookup or creation failute. + */ +static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object, + int event) +{ + struct fscache_cookie *cookie; - spin_lock(&cache->object_list_lock); - list_del_init(&object->cache_link); - spin_unlock(&cache->object_list_lock); + _enter("{OBJ%x},%d", object->debug_id, event); - fscache_stat(&fscache_n_cop_drop_object); - cache->ops->drop_object(object); - fscache_stat_d(&fscache_n_cop_drop_object); + object->oob_event_mask = 0; - if (parent) { - _debug("release parent OBJ%x {%d}", - parent->debug_id, parent->n_children); + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); - spin_lock(&parent->lock); - parent->n_children--; - if (parent->n_children == 0) - fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); - spin_unlock(&parent->lock); - object->parent = NULL; + cookie = object->cookie; + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + + fscache_done_parent_op(object); + return transit_to(KILL_OBJECT); +} + +/* + * Wait for completion of all active operations on this object and the death of + * all child objects of this object. + */ +static const struct fscache_state *fscache_kill_object(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x,%d,%d},%d", + object->debug_id, object->n_ops, object->n_children, event); + + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + object->oob_event_mask = 0; + + if (list_empty(&object->dependents) && + object->n_ops == 0 && + object->n_children == 0) + return transit_to(DROP_OBJECT); + + if (object->n_in_progress == 0) { + spin_lock(&object->lock); + if (object->n_ops > 0 && object->n_in_progress == 0) + fscache_start_operations(object); + spin_unlock(&object->lock); } - /* this just shifts the object release to the work processor */ - fscache_put_object(object); + if (!list_empty(&object->dependents)) + return transit_to(KILL_DEPENDENTS); - _leave(""); + return transit_to(WAIT_FOR_CLEARANCE); } /* - * release or recycle an object that the netfs has discarded + * Kill dependent objects. */ -static void fscache_release_object(struct fscache_object *object) +static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object, + int event) { - _enter(""); + _enter("{OBJ%x},%d", object->debug_id, event); - fscache_drop_object(object); + if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL)) + return NO_TRANSIT; /* Not finished */ + return transit_to(WAIT_FOR_CLEARANCE); } /* - * withdraw an object from active service + * Drop an object's attachments */ -static void fscache_withdraw_object(struct fscache_object *object) +static const struct fscache_state *fscache_drop_object(struct fscache_object *object, + int event) { - struct fscache_cookie *cookie; - bool detached; + struct fscache_object *parent = object->parent; + struct fscache_cookie *cookie = object->cookie; + struct fscache_cache *cache = object->cache; + bool awaken = false; - _enter(""); + _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event); - spin_lock(&object->lock); - cookie = object->cookie; - if (cookie) { - /* need to get the cookie lock before the object lock, starting - * from the object pointer */ - atomic_inc(&cookie->usage); - spin_unlock(&object->lock); + ASSERT(cookie != NULL); + ASSERT(!hlist_unhashed(&object->cookie_link)); - detached = false; - spin_lock(&cookie->lock); - spin_lock(&object->lock); + /* Make sure the cookie no longer points here and that the netfs isn't + * waiting for us. + */ + spin_lock(&cookie->lock); + hlist_del_init(&object->cookie_link); + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; + spin_unlock(&cookie->lock); - if (object->cookie == cookie) { - hlist_del_init(&object->cookie_link); - object->cookie = NULL; - fscache_invalidation_complete(cookie); - detached = true; - } - spin_unlock(&cookie->lock); - fscache_cookie_put(cookie); - if (detached) - fscache_cookie_put(cookie); - } + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + /* Prevent a race with our last child, which has to signal EV_CLEARED + * before dropping our spinlock. + */ + spin_lock(&object->lock); spin_unlock(&object->lock); - fscache_drop_object(object); -} + /* Discard from the cache's collection of objects */ + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); -/* - * withdraw an object from active service at the behest of the cache - * - need break the links to a cached object cookie - * - called under two situations: - * (1) recycler decides to reclaim an in-use object - * (2) a cache is unmounted - * - have to take care as the cookie can be being relinquished by the netfs - * simultaneously - * - the object is pinned by the caller holding a refcount on it - */ -void fscache_withdrawing_object(struct fscache_cache *cache, - struct fscache_object *object) -{ - bool enqueue = false; + fscache_stat(&fscache_n_cop_drop_object); + cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); - _enter(",OBJ%x", object->debug_id); + /* The parent object wants to know when all it dependents have gone */ + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); - spin_lock(&object->lock); - if (object->state < FSCACHE_OBJECT_WITHDRAWING) { - object->state = FSCACHE_OBJECT_WITHDRAWING; - enqueue = true; + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; } - spin_unlock(&object->lock); - if (enqueue) - fscache_enqueue_object(object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); + fscache_stat(&fscache_n_object_dead); _leave(""); + return transit_to(OBJECT_DEAD); } /* @@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object) } /* - * discard a ref on a work item + * Discard a ref on an object */ static void fscache_put_object(struct fscache_object *object) { @@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object) fscache_stat_d(&fscache_n_cop_put_object); } +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. + */ +void fscache_object_destroy(struct fscache_object *object) +{ + fscache_objlist_remove(object); + + /* We can get rid of the cookie now */ + fscache_cookie_put(object->cookie); + object->cookie = NULL; +} +EXPORT_SYMBOL(fscache_object_destroy); + /* * enqueue an object for metadata-type processing */ @@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object) /** * fscache_object_sleep_till_congested - Sleep until object wq is congested - * @timoutp: Scheduler sleep timeout + * @timeoutp: Scheduler sleep timeout * * Allow an object handler to sleep until the object workqueue is congested. * @@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp) EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* - * enqueue the dependents of an object for metadata-type processing - * - the caller must hold the object's lock - * - this may cause an already locked object to wind up being processed again + * Enqueue the dependents of an object for metadata-type processing. + * + * If we don't manage to finish the list before the scheduler wants to run + * again then return false immediately. We return true if the list was + * cleared. */ -static void fscache_enqueue_dependents(struct fscache_object *object) +static bool fscache_enqueue_dependents(struct fscache_object *object, int event) { struct fscache_object *dep; + bool ret = true; _enter("{OBJ%x}", object->debug_id); if (list_empty(&object->dependents)) - return; + return true; spin_lock(&object->lock); @@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object) struct fscache_object, dep_link); list_del_init(&dep->dep_link); - - /* sort onto appropriate lists */ - fscache_enqueue_object(dep); + fscache_raise_event(dep, event); fscache_put_object(dep); - if (!list_empty(&object->dependents)) - cond_resched_lock(&object->lock); + if (!list_empty(&object->dependents) && need_resched()) { + ret = false; + break; + } } spin_unlock(&object->lock); + return ret; } /* * remove an object from whatever queue it's waiting on - * - the caller must hold object->lock */ -void fscache_dequeue_object(struct fscache_object *object) +static void fscache_dequeue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); @@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object) * @data: The auxiliary data for the object * @datalen: The size of the auxiliary data * - * This function consults the netfs about the coherency state of an object + * This function consults the netfs about the coherency state of an object. + * The caller must be holding a ref on cookie->n_active (held by + * fscache_look_up_object() on behalf of the cache backend during object lookup + * and creation). */ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen) @@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux); /* * Asynchronously invalidate an object. */ -static void fscache_invalidate_object(struct fscache_object *object) +static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object, + int event) { struct fscache_operation *op; struct fscache_cookie *cookie = object->cookie; - _enter("{OBJ%x}", object->debug_id); + _enter("{OBJ%x},%d", object->debug_id, event); + + /* We're going to need the cookie. If the cookie is not available then + * retire the object instead. + */ + if (!fscache_use_cookie(object)) { + ASSERT(object->cookie->stores.rnode == NULL); + set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + _leave(" [no cookie]"); + return transit_to(KILL_OBJECT); + } /* Reject any new read/write ops and abort any that are pending. */ fscache_invalidate_writes(cookie); @@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object) /* Now we have to wait for in-progress reads and writes */ op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); - _leave(" [ENOMEM]"); - return; - } + if (!op) + goto nomem; fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); - op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); if (fscache_submit_exclusive_op(object, op) < 0) @@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object) /* We can allow read and write requests to come in once again. They'll * queue up behind our exclusive invalidation operation. */ - fscache_invalidation_complete(cookie); - _leave(""); - return; + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + _leave(" [ok]"); + return transit_to(UPDATE_OBJECT); + +nomem: + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_unuse_cookie(object); + _leave(" [ENOMEM]"); + return transit_to(KILL_OBJECT); submit_op_failed: + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); spin_unlock(&cookie->lock); kfree(op); - fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); _leave(" [EIO]"); + return transit_to(KILL_OBJECT); +} + +static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object, + int event) +{ + const struct fscache_state *s; + + fscache_stat(&fscache_n_invalidates_run); + fscache_stat(&fscache_n_cop_invalidate_object); + s = _fscache_invalidate_object(object, event); + fscache_stat_d(&fscache_n_cop_invalidate_object); + return s; +} + +/* + * Asynchronously update an object. + */ +static const struct fscache_state *fscache_update_object(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); + + _leave(""); + return transit_to(WAIT_FOR_CMD); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 762a9ec4ffa4..318071aca217 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) ASSERT(list_empty(&op->pend_link)); ASSERT(op->processor != NULL); - ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); + ASSERT(fscache_object_is_available(op->object)); ASSERTCMP(atomic_read(&op->usage), >, 0); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); @@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (object->state == FSCACHE_OBJECT_CREATING) { + } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, */ static void fscache_report_unexpected_submission(struct fscache_object *object, struct fscache_operation *op, - unsigned long ostate) + const struct fscache_state *ostate) { static bool once_only; struct fscache_operation *p; @@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, once_only = true; kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, - fscache_object_states[object->state]); - kdebug("objstate=%s [%s]", - fscache_object_states[object->state], - fscache_object_states[ostate]); + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); kdebug("objflags=%lx", object->flags); kdebug("objevent=%lx [%lx]", object->events, object->event_mask); kdebug("ops=%u inp=%u exc=%u", @@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { - unsigned long ostate; + const struct fscache_state *ostate; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (object->state == FSCACHE_OBJECT_CREATING) { + } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (object->state == FSCACHE_OBJECT_DYING || - object->state == FSCACHE_OBJECT_LC_DYING || - object->state == FSCACHE_OBJECT_WITHDRAWING) { + } else if (fscache_object_is_dying(object)) { fscache_stat(&fscache_n_op_rejected); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; @@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object) } /* - * jump start the operation processing on an object - * - caller must hold object->lock + * Jump start the operation processing on an object. The caller must hold + * object->lock. */ void fscache_start_operations(struct fscache_object *object) { @@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op) object = op->object; - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { - if (atomic_dec_and_test(&object->n_reads)) { - clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, - &object->cookie->flags); - wake_up_bit(&object->cookie->flags, - FSCACHE_COOKIE_WAITING_ON_READS); - } - } + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); /* now... we may get called with the object spinlock held, so we * complete the cleanup here only if we can immediately acquire the diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ff000e52072d..d479ab3c63e4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -109,7 +109,7 @@ page_busy: * allocator as the work threads writing to the cache may all end up * sleeping on memory allocation, so we may need to impose a timeout * too. */ - if (!(gfp & __GFP_WAIT)) { + if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { fscache_stat(&fscache_n_store_vmscan_busy); return false; } @@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object)) { + if (fscache_object_is_active(object) && + fscache_use_cookie(object)) { fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); + fscache_unuse_cookie(object); if (ret < 0) fscache_abort_object(object); } @@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(op->n_pages, ==, 0); + ASSERTCMP(atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) @@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) * allocate a retrieval op */ static struct fscache_retrieval *fscache_alloc_retrieval( + struct fscache_cookie *cookie, struct address_space *mapping, fscache_rw_complete_t end_io_func, void *context) @@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval( } fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); - op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + atomic_inc(&cookie->n_active); + op->op.flags = FSCACHE_OP_MYTHREAD | + (1UL << FSCACHE_OP_WAITING) | + (1UL << FSCACHE_OP_UNUSE_COOKIE); op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; @@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op) struct fscache_retrieval *op = container_of(_op, struct fscache_retrieval, op); - op->n_pages = 0; + atomic_set(&op->n_pages, 0); } /* @@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; - op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + op = fscache_alloc_retrieval(cookie, page->mapping, + end_io_func,context); if (!op) { _leave(" = -ENOMEM"); return -ENOMEM; } - op->n_pages = 1; + atomic_set(&op->n_pages, 1); spin_lock(&cookie->lock); @@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -465,6 +472,7 @@ nobufs_unlock_dec: atomic_dec(&object->n_reads); nobufs_unlock: spin_unlock(&cookie->lock); + atomic_dec(&cookie->n_active); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; - op = fscache_alloc_retrieval(mapping, end_io_func, context); + op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context); if (!op) return -ENOMEM; - op->n_pages = *nr_pages; + atomic_set(&op->n_pages, *nr_pages); spin_lock(&cookie->lock); @@ -589,6 +597,7 @@ nobufs_unlock_dec: atomic_dec(&object->n_reads); nobufs_unlock: spin_unlock(&cookie->lock); + atomic_dec(&cookie->n_active); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; - op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL); if (!op) return -ENOMEM; - op->n_pages = 1; + atomic_set(&op->n_pages, 1); spin_lock(&cookie->lock); @@ -675,6 +684,7 @@ error: nobufs_unlock: spin_unlock(&cookie->lock); + atomic_dec(&cookie->n_active); kfree(op); nobufs: fscache_stat(&fscache_n_allocs_nobufs); @@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op) */ spin_unlock(&object->lock); fscache_op_complete(&op->op, false); - _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", - _op->flags, _op->state, object->state, object->flags); + _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", + _op->flags, _op->state, object->state->short_name, + object->flags); return; } @@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) _enter(""); - while (spin_lock(&cookie->stores_lock), - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, - ARRAY_SIZE(results), - FSCACHE_COOKIE_PENDING_TAG), - n > 0) { + for (;;) { + spin_lock(&cookie->stores_lock); + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, + ARRAY_SIZE(results), + FSCACHE_COOKIE_PENDING_TAG); + if (n == 0) { + spin_unlock(&cookie->stores_lock); + break; + } + for (i = n - 1; i >= 0; i--) { page = results[i]; radix_tree_delete(&cookie->stores, page->index); @@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) page_cache_release(results[i]); } - spin_unlock(&cookie->stores_lock); _leave(""); } @@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is * set) * - * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred - * fill op) + * (a) no writes yet * * (b) writes deferred till post-creation (mark page for writing and * return immediately) * * (2) negative lookup, object created, initial fill being made from netfs - * (FSCACHE_COOKIE_INITIAL_FILL is set) * * (a) fill point not yet reached this page (mark page for writing and * return) @@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_operation_init(&op->op, fscache_write_op, fscache_release_write_op); - op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); + op->op.flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_WAITING) | + (1 << FSCACHE_OP_UNUSE_COOKIE); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) @@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); op->store_limit = object->store_limit; + atomic_inc(&cookie->n_active); if (fscache_submit_op(object, &op->op) < 0) goto submit_failed; @@ -945,6 +961,7 @@ already_pending: return 0; submit_failed: + atomic_dec(&cookie->n_active); spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 8179e8bc4a3d..40d13c70ef51 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -287,5 +287,5 @@ const struct file_operations fscache_stats_fops = { .open = fscache_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 6f96a8def147..aef34b1e635e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> @@ -92,8 +93,9 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, { loff_t pos = 0; struct iovec iov = { .iov_base = buf, .iov_len = count }; + struct fuse_io_priv io = { .async = 0, .file = file }; - return fuse_direct_io(file, &iov, 1, count, &pos, 0); + return fuse_direct_io(&io, &iov, 1, count, &pos, 0); } static ssize_t cuse_write(struct file *file, const char __user *buf, @@ -101,12 +103,13 @@ static ssize_t cuse_write(struct file *file, const char __user *buf, { loff_t pos = 0; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct fuse_io_priv io = { .async = 0, .file = file }; /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(file, &iov, 1, count, &pos, 1); + return fuse_direct_io(&io, &iov, 1, count, &pos, 1); } static int cuse_open(struct inode *inode, struct file *file) @@ -422,7 +425,7 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req(fc, 1); + req = fuse_get_req_for_background(fc, 1); if (IS_ERR(req)) { rc = PTR_ERR(req); goto err; @@ -504,7 +507,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) cc->fc.release = cuse_fc_release; cc->fc.connected = 1; - cc->fc.blocked = 0; + cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { fuse_conn_put(&cc->fc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 11dfa0c3fb46..1d55f9465400 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -111,7 +112,7 @@ static void restore_sigs(sigset_t *oldset) sigprocmask(SIG_SETMASK, oldset, NULL); } -static void __fuse_get_request(struct fuse_req *req) +void __fuse_get_request(struct fuse_req *req) { atomic_inc(&req->count); } @@ -130,20 +131,30 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) +{ + return !fc->initialized || (for_background && fc->blocked); +} + +static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, + bool for_background) { struct fuse_req *req; - sigset_t oldset; - int intr; int err; - atomic_inc(&fc->num_waiting); - block_sigs(&oldset); - intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); - restore_sigs(&oldset); - err = -EINTR; - if (intr) - goto out; + + if (fuse_block_alloc(fc, for_background)) { + sigset_t oldset; + int intr; + + block_sigs(&oldset); + intr = wait_event_interruptible_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background)); + restore_sigs(&oldset); + err = -EINTR; + if (intr) + goto out; + } err = -ENOTCONN; if (!fc->connected) @@ -151,19 +162,35 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) req = fuse_request_alloc(npages); err = -ENOMEM; - if (!req) + if (!req) { + if (for_background) + wake_up(&fc->blocked_waitq); goto out; + } fuse_req_init_context(req); req->waiting = 1; + req->background = for_background; return req; out: atomic_dec(&fc->num_waiting); return ERR_PTR(err); } + +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +{ + return __fuse_get_req(fc, npages, false); +} EXPORT_SYMBOL_GPL(fuse_get_req); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, + unsigned npages) +{ + return __fuse_get_req(fc, npages, true); +} +EXPORT_SYMBOL_GPL(fuse_get_req_for_background); + /* * Return request in fuse_file->reserved_req. However that may * currently be in use. If that is the case, wait for it to become @@ -225,19 +252,31 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, struct fuse_req *req; atomic_inc(&fc->num_waiting); - wait_event(fc->blocked_waitq, !fc->blocked); + wait_event(fc->blocked_waitq, fc->initialized); req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); fuse_req_init_context(req); req->waiting = 1; + req->background = 0; return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { + if (unlikely(req->background)) { + /* + * We get here in the unlikely case that a background + * request was allocated but not sent + */ + spin_lock(&fc->lock); + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->lock); + } + if (req->waiting) atomic_dec(&fc->num_waiting); @@ -335,10 +374,15 @@ __releases(fc->lock) list_del(&req->intr_entry); req->state = FUSE_REQ_FINISHED; if (req->background) { - if (fc->num_background == fc->max_background) { + req->background = 0; + + if (fc->num_background == fc->max_background) fc->blocked = 0; - wake_up_all(&fc->blocked_waitq); - } + + /* Wake up next waiter, if any */ + if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + if (fc->num_background == fc->congestion_threshold && fc->connected && fc->bdi_initialized) { clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); @@ -442,6 +486,7 @@ __acquires(fc->lock) static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { + BUG_ON(req->background); spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; @@ -469,7 +514,7 @@ EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, struct fuse_req *req) { - req->background = 1; + BUG_ON(!req->background); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -1319,7 +1364,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, page_nr++; ret += buf->len; - if (pipe->inode) + if (pipe->files) do_wakeup = 1; } @@ -2071,6 +2116,7 @@ void fuse_abort_conn(struct fuse_conn *fc) if (fc->connected) { fc->connected = 0; fc->blocked = 0; + fc->initialized = 1; end_io_requests(fc); end_queued_requests(fc); end_polls(fc); @@ -2089,6 +2135,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) spin_lock(&fc->lock); fc->connected = 0; fc->blocked = 0; + fc->initialized = 1; end_queued_requests(fc); end_polls(fc); wake_up_all(&fc->blocked_waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ff15522481d4..0eda52738ec4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,7 +14,7 @@ #include <linux/namei.h> #include <linux/slab.h> -static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_inode *fi = get_fuse_inode(dir); @@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) return true; if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) return true; - if (filp->f_pos == 0) + if (ctx->pos == 0) return true; return false; } @@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; + struct dentry *parent; + struct fuse_conn *fc; inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) @@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_forget_link *forget; - struct dentry *parent; u64 attr_version; /* For negative dentries, always do a fresh lookup */ @@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) entry_attr_timeout(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fc = get_fuse_conn(inode); + if (fc->readdirplus_auto) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(parent->d_inode); + dput(parent); + } } - fuse_advise_use_readdirplus(inode); return 1; } @@ -1159,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask) } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, - void *dstbuf, filldir_t filldir) + struct dir_context *ctx) { while (nbytes >= FUSE_NAME_OFFSET) { struct fuse_dirent *dirent = (struct fuse_dirent *) buf; size_t reclen = FUSE_DIRENT_SIZE(dirent); - int over; if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) break; - over = filldir(dstbuf, dirent->name, dirent->namelen, - file->f_pos, dirent->ino, dirent->type); - if (over) + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) break; buf += reclen; nbytes -= reclen; - file->f_pos = dirent->off; + ctx->pos = dirent->off; } return 0; @@ -1278,7 +1282,7 @@ out: } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - void *dstbuf, filldir_t filldir, u64 attr_version) + struct dir_context *ctx, u64 attr_version) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -1303,10 +1307,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, we need to send a FORGET for each of those which we did not link. */ - over = filldir(dstbuf, dirent->name, dirent->namelen, - file->f_pos, dirent->ino, - dirent->type); - file->f_pos = dirent->off; + over = !dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type); + ctx->pos = dirent->off; } buf += reclen; @@ -1320,7 +1323,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, return 0; } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_readdir(struct file *file, struct dir_context *ctx) { int plus, err; size_t nbytes; @@ -1343,17 +1346,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) return -ENOMEM; } - plus = fuse_use_readdirplus(inode, file); + plus = fuse_use_readdirplus(inode, ctx); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = PAGE_SIZE; if (plus) { attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIR); } fuse_request_send(fc, req); @@ -1363,11 +1366,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (!err) { if (plus) { err = parse_dirplusfile(page_address(page), nbytes, - file, dstbuf, filldir, + file, ctx, attr_version); } else { err = parse_dirfile(page_address(page), nbytes, file, - dstbuf, filldir); + ctx); } } @@ -1562,10 +1565,9 @@ void fuse_release_nowrite(struct inode *inode) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, - struct file *file) +int fuse_do_setattr(struct inode *inode, struct iattr *attr, + struct file *file) { - struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; struct fuse_setattr_in inarg; @@ -1574,9 +1576,6 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, loff_t oldsize; int err; - if (!fuse_allow_current_process(fc)) - return -EACCES; - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1671,10 +1670,15 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { + struct inode *inode = entry->d_inode; + + if (!fuse_allow_current_process(get_fuse_conn(inode))) + return -EACCES; + if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(entry, attr, attr->ia_file); + return fuse_do_setattr(inode, attr, attr->ia_file); else - return fuse_do_setattr(entry, attr, NULL); + return fuse_do_setattr(inode, attr, NULL); } static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, @@ -1879,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = { static const struct file_operations fuse_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = fuse_readdir, + .iterate = fuse_readdir, .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 34b80ba95bad..5c121fe19c5f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,6 +15,8 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/swap.h> +#include <linux/aio.h> +#include <linux/falloc.h> static const struct file_operations fuse_direct_io_file_operations; @@ -126,11 +128,13 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) struct fuse_req *req = ff->reserved_req; if (sync) { + req->background = 0; fuse_request_send(ff->fc, req); path_put(&req->misc.release.path); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; + req->background = 1; fuse_request_send_background(ff->fc, req); } kfree(ff); @@ -282,6 +286,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags) WARN_ON(atomic_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); ff->reserved_req->force = 1; + ff->reserved_req->background = 0; fuse_request_send(ff->fc, ff->reserved_req); fuse_put_request(ff->fc, ff->reserved_req); kfree(ff); @@ -491,9 +496,114 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static size_t fuse_send_read(struct fuse_req *req, struct file *file, +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ + unsigned i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (write) + set_page_dirty_lock(page); + put_page(page); + } +} + +/** + * In case of short read, the caller sets 'pos' to the position of + * actual end of fuse request in IO request. Otherwise, if bytes_requested + * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. + * + * An example: + * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * both submitted asynchronously. The first of them was ACKed by userspace as + * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The + * second request was ACKed as short, e.g. only 1K was read, resulting in + * pos == 33K. + * + * Thus, when all fuse requests are completed, the minimal non-negative 'pos' + * will be equal to the length of the longest contiguous fragment of + * transferred data starting from the beginning of IO request. + */ +static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) +{ + int left; + + spin_lock(&io->lock); + if (err) + io->err = io->err ? : err; + else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) + io->bytes = pos; + + left = --io->reqs; + spin_unlock(&io->lock); + + if (!left) { + long res; + + if (io->err) + res = io->err; + else if (io->bytes >= 0 && io->write) + res = -EIO; + else { + res = io->bytes < 0 ? io->size : io->bytes; + + if (!is_sync_kiocb(io->iocb)) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + spin_unlock(&fc->lock); + } + } + + aio_complete(io->iocb, res, 0); + kfree(io); + } +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_io_priv *io = req->io; + ssize_t pos = -1; + + fuse_release_user_pages(req, !io->write); + + if (io->write) { + if (req->misc.write.in.size != req->misc.write.out.size) + pos = req->misc.write.in.offset - io->offset + + req->misc.write.out.size; + } else { + if (req->misc.read.in.size != req->out.args[0].size) + pos = req->misc.read.in.offset - io->offset + + req->out.args[0].size; + } + + fuse_aio_complete(io, req->out.h.error, pos); +} + +static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, + size_t num_bytes, struct fuse_io_priv *io) +{ + spin_lock(&io->lock); + io->size += num_bytes; + io->reqs++; + spin_unlock(&io->lock); + + req->io = io; + req->end = fuse_aio_complete_req; + + __fuse_get_request(req); + fuse_request_send_background(fc, req); + + return num_bytes; +} + +static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { + struct file *file = io->file; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -504,6 +614,10 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, inarg->read_flags |= FUSE_READ_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } + + if (io->async) + return fuse_async_req_send(fc, req, count, io); + fuse_request_send(fc, req); return req->out.args[0].size; } @@ -524,6 +638,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, static int fuse_readpage(struct file *file, struct page *page) { + struct fuse_io_priv io = { .async = 0, .file = file }; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -556,7 +671,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = count; - num_read = fuse_send_read(req, file, pos, count, NULL); + num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -661,7 +776,12 @@ static int fuse_readpages_fill(void *_data, struct page *page) int nr_alloc = min_t(unsigned, data->nr_pages, FUSE_MAX_PAGES_PER_REQ); fuse_send_readpages(req, data->file); - data->req = req = fuse_get_req(fc, nr_alloc); + if (fc->async_read) + req = fuse_get_req_for_background(fc, nr_alloc); + else + req = fuse_get_req(fc, nr_alloc); + + data->req = req; if (IS_ERR(req)) { unlock_page(page); return PTR_ERR(req); @@ -696,7 +816,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_req(fc, nr_alloc); + if (fc->async_read) + data.req = fuse_get_req_for_background(fc, nr_alloc); + else + data.req = fuse_get_req(fc, nr_alloc); data.nr_pages = nr_pages; err = PTR_ERR(data.req); if (IS_ERR(data.req)) @@ -758,9 +881,10 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, req->out.args[0].value = outarg; } -static size_t fuse_send_write(struct fuse_req *req, struct file *file, +static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { + struct file *file = io->file; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; struct fuse_write_in *inarg = &req->misc.write.in; @@ -771,6 +895,10 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } + + if (io->async) + return fuse_async_req_send(fc, req, count, io); + fuse_request_send(fc, req); return req->misc.write.out.size; } @@ -794,11 +922,12 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, size_t res; unsigned offset; unsigned i; + struct fuse_io_priv io = { .async = 0, .file = file }; for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); - res = fuse_send_write(req, file, pos, count, NULL); + res = fuse_send_write(req, &io, pos, count, NULL); offset = req->page_descs[0].offset; count = res; @@ -971,7 +1100,6 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -1030,23 +1158,10 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); return written ? written : err; } -static void fuse_release_user_pages(struct fuse_req *req, int write) -{ - unsigned i; - - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (write) - set_page_dirty_lock(page); - put_page(page); - } -} - static inline void fuse_page_descs_length_init(struct fuse_req *req, unsigned index, unsigned nr_pages) { @@ -1148,10 +1263,11 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p) return min(npages, FUSE_MAX_PAGES_PER_REQ); } -ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, +ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, int write) { + struct file *file = io->file; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; size_t nmax = write ? fc->max_write : fc->max_read; @@ -1162,7 +1278,10 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1177,11 +1296,12 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, } if (write) - nres = fuse_send_write(req, file, pos, nbytes, owner); + nres = fuse_send_write(req, io, pos, nbytes, owner); else - nres = fuse_send_read(req, file, pos, nbytes, owner); + nres = fuse_send_read(req, io, pos, nbytes, owner); - fuse_release_user_pages(req, !write); + if (!io->async) + fuse_release_user_pages(req, !write); if (req->out.h.error) { if (!res) res = req->out.h.error; @@ -1197,7 +1317,11 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, + fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -1211,17 +1335,19 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, } EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +static ssize_t __fuse_direct_read(struct fuse_io_priv *io, + const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, + size_t count) { ssize_t res; + struct file *file = io->file; struct inode *inode = file_inode(file); if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), - ppos, 0); + res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0); fuse_invalidate_attr(inode); @@ -1231,23 +1357,23 @@ static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, static ssize_t fuse_direct_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + struct fuse_io_priv io = { .async = 0, .file = file }; struct iovec iov = { .iov_base = buf, .iov_len = count }; - return __fuse_direct_read(file, &iov, 1, ppos); + return __fuse_direct_read(&io, &iov, 1, ppos, count); } -static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, +static ssize_t __fuse_direct_write(struct fuse_io_priv *io, + const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { + struct file *file = io->file; struct inode *inode = file_inode(file); size_t count = iov_length(iov, nr_segs); ssize_t res; res = generic_write_checks(file, ppos, &count, 0); - if (!res) { - res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); - if (res > 0) - fuse_write_update_size(inode, *ppos); - } + if (!res) + res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); fuse_invalidate_attr(inode); @@ -1260,13 +1386,16 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct inode *inode = file_inode(file); ssize_t res; + struct fuse_io_priv io = { .async = 0, .file = file }; if (is_bad_inode(inode)) return -EIO; /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(file, &iov, 1, ppos); + res = __fuse_direct_write(&io, &iov, 1, ppos); + if (res > 0) + fuse_write_update_size(inode, *ppos); mutex_unlock(&inode->i_mutex); return res; @@ -1375,6 +1504,7 @@ static int fuse_writepage_locked(struct page *page) if (!req) goto err; + req->background = 1; /* writeback always goes to bg_queue */ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; @@ -2228,21 +2358,99 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } +static void fuse_do_truncate(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + struct iattr attr; + + attr.ia_valid = ATTR_SIZE; + attr.ia_size = i_size_read(inode); + + attr.ia_file = file; + attr.ia_valid |= ATTR_FILE; + + fuse_do_setattr(inode, &attr, file); +} + +static inline loff_t fuse_round_up(loff_t off) +{ + return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { ssize_t ret = 0; - struct file *file = NULL; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + bool async_dio = ff->fc->async_dio; loff_t pos = 0; + struct inode *inode; + loff_t i_size; + size_t count = iov_length(iov, nr_segs); + struct fuse_io_priv *io; - file = iocb->ki_filp; pos = offset; + inode = file->f_mapping->host; + i_size = i_size_read(inode); + + /* optimization for short read */ + if (async_dio && rw != WRITE && offset + count > i_size) { + if (offset >= i_size) + return 0; + count = min_t(loff_t, count, fuse_round_up(i_size - offset)); + } + + io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); + if (!io) + return -ENOMEM; + spin_lock_init(&io->lock); + io->reqs = 1; + io->bytes = -1; + io->size = 0; + io->offset = offset; + io->write = (rw == WRITE); + io->err = 0; + io->file = file; + /* + * By default, we want to optimize all I/Os with async request + * submission to the client filesystem if supported. + */ + io->async = async_dio; + io->iocb = iocb; + + /* + * We cannot asynchronously extend the size of a file. We have no method + * to wait on real async I/O requests, so we must submit this request + * synchronously. + */ + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) + io->async = false; if (rw == WRITE) - ret = __fuse_direct_write(file, iov, nr_segs, &pos); + ret = __fuse_direct_write(io, iov, nr_segs, &pos); else - ret = __fuse_direct_read(file, iov, nr_segs, &pos); + ret = __fuse_direct_read(io, iov, nr_segs, &pos, count); + + if (io->async) { + fuse_aio_complete(io, ret < 0 ? ret : 0, -1); + + /* we have a non-extending, async request, so return */ + if (!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + ret = wait_on_sync_kiocb(iocb); + } else { + kfree(io); + } + + if (rw == WRITE) { + if (ret > 0) + fuse_write_update_size(inode, pos); + else if (ret < 0 && offset + count > i_size) + fuse_do_truncate(file); + } return ret; } @@ -2251,6 +2459,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; + struct inode *inode = file->f_inode; struct fuse_conn *fc = ff->fc; struct fuse_req *req; struct fuse_fallocate_in inarg = { @@ -2260,13 +2469,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; + bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_PUNCH_HOLE); if (fc->no_fallocate) return -EOPNOTSUPP; + if (lock_inode) { + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_set_nowrite(inode); + } + req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } req->in.h.opcode = FUSE_FALLOCATE; req->in.h.nodeid = ff->nodeid; @@ -2281,6 +2500,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) + fuse_write_update_size(inode, offset + length); + + if (mode & FALLOC_FL_PUNCH_HOLE) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr(inode); + +out: + if (lock_inode) { + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_release_nowrite(inode); + mutex_unlock(&inode->i_mutex); + } + return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6aeba864f070..fde7249a3a96 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -228,6 +228,20 @@ enum fuse_req_state { FUSE_REQ_FINISHED }; +/** The request IO state (for asynchronous processing) */ +struct fuse_io_priv { + int async; + spinlock_t lock; + unsigned reqs; + ssize_t bytes; + size_t size; + __u64 offset; + bool write; + int err; + struct kiocb *iocb; + struct file *file; +}; + /** * A request to the client */ @@ -332,6 +346,9 @@ struct fuse_req { /** Inode used in the request or NULL */ struct inode *inode; + /** AIO control block */ + struct fuse_io_priv *io; + /** Link on fi->writepages */ struct list_head writepages_entry; @@ -417,6 +434,10 @@ struct fuse_conn { /** Batching of FORGET requests (positive indicates FORGET batch) */ int forget_batch; + /** Flag indicating that INIT reply has been received. Allocating + * any fuse request will be suspended until the flag is set */ + int initialized; + /** Flag indicating if connection is blocked. This will be the case before the INIT reply is received, and if there are too many outstading backgrounds requests */ @@ -520,6 +541,9 @@ struct fuse_conn { /** Does the filesystem want adaptive readdirplus? */ unsigned readdirplus_auto:1; + /** Does the filesystem support asynchronous direct-IO submission? */ + unsigned async_dio:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -708,6 +732,13 @@ void fuse_request_free(struct fuse_req *req); * caller should specify # elements in req->pages[] explicitly */ struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, + unsigned npages); + +/* + * Increment reference count on request + */ +void __fuse_get_request(struct fuse_req *req); /** * Get a request, may fail with -ENOMEM, @@ -823,7 +854,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, +ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, @@ -835,4 +866,7 @@ int fuse_dev_release(struct inode *inode, struct file *file); void fuse_write_update_size(struct inode *inode, loff_t pos); +int fuse_do_setattr(struct inode *inode, struct iattr *attr, + struct file *file); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 137185c3884f..0b578598c6ac 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -346,6 +346,7 @@ static void fuse_send_destroy(struct fuse_conn *fc) fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; req->force = 1; + req->background = 0; fuse_request_send(fc, req); fuse_put_request(fc, req); } @@ -362,6 +363,7 @@ void fuse_conn_kill(struct fuse_conn *fc) spin_lock(&fc->lock); fc->connected = 0; fc->blocked = 0; + fc->initialized = 1; spin_unlock(&fc->lock); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -581,7 +583,8 @@ void fuse_conn_init(struct fuse_conn *fc) fc->khctr = 0; fc->polled_files = RB_ROOT; fc->reqctr = 0; - fc->blocked = 1; + fc->blocked = 0; + fc->initialized = 0; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } @@ -782,7 +785,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) @@ -864,10 +867,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) + if (arg->flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) - fc->readdirplus_auto = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } + if (arg->flags & FUSE_ASYNC_DIO) + fc->async_dio = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -880,7 +886,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } - fc->blocked = 0; + fc->initialized = 1; wake_up_all(&fc->blocked_waitq); } @@ -895,7 +901,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | - FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1043,6 +1049,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; + init_req->background = 1; if (is_bdev) { fc->destroy_req = fuse_request_alloc(0); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index eb08c9e43c2a..90c6a8faaecb 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -20,13 +20,12 @@ config GFS2_FS be found here: http://sources.redhat.com/cluster The "nolock" lock module is now built in to GFS2 by default. If - you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 - networking. + you want to use the DLM, be sure to enable IPv4/6 networking. config GFS2_FS_LOCKING_DLM bool "GFS2 DLM locking" depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ - HOTPLUG && DLM && CONFIGFS_FS && SYSFS + CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) help Multiple node locking module for GFS2 diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 24f414f0ce61..ee48ad37d9c0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 +20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" @@ -109,7 +110,7 @@ static int gfs2_writepage_common(struct page *page, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); goto out; } return 1; @@ -298,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, /* Is the page fully outside i_size? (truncate in progress) */ if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, + PAGE_CACHE_SIZE); unlock_page(page); continue; } @@ -942,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) unlock_buffer(bh); } -static void gfs2_invalidatepage(struct page *page, unsigned long offset) +static void gfs2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); struct buffer_head *bh, *head; unsigned long pos = 0; BUG_ON(!PageLocked(page)); - if (offset == 0) + if (!partial_page) ClearPageChecked(page); if (!page_has_buffers(page)) goto out; bh = head = page_buffers(page); do { + if (pos + bh->b_size > stop) + return; + if (offset <= pos) gfs2_discard(sdp, bh); pos += bh->b_size; bh = bh->b_this_page; } while (bh != head); out: - if (offset == 0) + if (!partial_page) try_to_release_page(page, 0); } @@ -1055,7 +1063,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (atomic_read(&bh->b_count)) goto cannot_release; bd = bh->b_private; - if (bd && bd->bd_ail) + if (bd && bd->bd_tr) goto cannot_release; if (buffer_pinned(bh) || buffer_dirty(bh)) goto not_possible; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e83657f046e..5e2f56fccf6b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -787,7 +787,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, goto out_rlist; if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, @@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size) unstuff = 1; } - error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? + 0 : RES_QUOTA), 0); if (error) goto do_grow_release; @@ -1286,17 +1288,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; + ret = get_write_access(inode); + if (ret) + return ret; + inode_dio_wait(inode); ret = gfs2_rs_alloc(GFS2_I(inode)); if (ret) - return ret; + goto out; oldsize = inode->i_size; - if (newsize >= oldsize) - return do_grow(inode, newsize); + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } - return do_shrink(inode, oldsize, newsize); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 4fddb3c22d25..f2448ab2aac5 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -109,8 +109,7 @@ fail: return 0; } -static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, - struct qstr *str) +static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) { str->hash = gfs2_disk_hash(str->name, str->len); return 0; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3e82bd23179..0cb4c1557f20 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) return ERR_PTR(-EIO); } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + if (hc == NULL) return ERR_PTR(-ENOMEM); ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else + if (ip->i_hash_cache) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + } else { ip->i_hash_cache = hc; + } spin_unlock(&inode->i_lock); return ip->i_hash_cache; @@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { __be64 *hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -1113,10 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (IS_ERR(hc)) return PTR_ERR(hc); - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + if (!hc2) return -ENOMEM; + h = hc2; error = gfs2_meta_inode_buffer(dip, &dibh); if (error) goto out_kfree; @@ -1145,7 +1161,10 @@ fail: gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - kfree(hc2); + if (is_vmalloc_addr(hc2)) + vfree(hc2); + else + kfree(hc2); return error; } @@ -1194,9 +1213,7 @@ static int compare_dents(const void *a, const void *b) /** * do_filldir_main - read out directory entries * @dip: The GFS2 inode - * @offset: The offset in the file to read from - * @opaque: opaque data to pass to filldir - * @filldir: The function to pass entries to + * @ctx: what to feed the entries to * @darr: an array of struct gfs2_dirent pointers to read * @entries: the number of entries in darr * @copied: pointer to int that's non-zero if a entry has been copied out @@ -1206,11 +1223,10 @@ static int compare_dents(const void *a, const void *b) * the possibility that they will fall into different readdir buffers or * that someone will want to seek to that location. * - * Returns: errno, >0 on exception from filldir + * Returns: errno, >0 if the actor tells you to stop */ -static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, - void *opaque, filldir_t filldir, +static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, const struct gfs2_dirent **darr, u32 entries, int *copied) { @@ -1218,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, u64 off, off_next; unsigned int x, y; int run = 0; - int error = 0; sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); @@ -1235,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, off_next = be32_to_cpu(dent_next->de_hash); off_next = gfs2_disk_hash2offset(off_next); - if (off < *offset) + if (off < ctx->pos) continue; - *offset = off; + ctx->pos = off; if (off_next == off) { if (*copied && !run) @@ -1246,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, } else run = 0; } else { - if (off < *offset) + if (off < ctx->pos) continue; - *offset = off; + ctx->pos = off; } - error = filldir(opaque, (const char *)(dent + 1), + if (!dir_emit(ctx, (const char *)(dent + 1), be16_to_cpu(dent->de_name_len), - off, be64_to_cpu(dent->de_inum.no_addr), - be16_to_cpu(dent->de_type)); - if (error) + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type))) return 1; *copied = 1; } - /* Increment the *offset by one, so the next time we come into the + /* Increment the ctx->pos by one, so the next time we come into the do_filldir fxn, we get the next entry instead of the last one in the current leaf */ - (*offset)++; + ctx->pos++; return 0; } @@ -1289,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr) kfree(ptr); } -static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, int *copied, unsigned *depth, +static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, + int *copied, unsigned *depth, u64 leaf_no) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1368,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, } while(lfn); BUG_ON(entries2 != entries); - error = do_filldir_main(ip, offset, opaque, filldir, darr, - entries, copied); + error = do_filldir_main(ip, ctx, darr, entries, copied); out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); @@ -1428,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, /** * dir_e_read - Reads the entries from a directory into a filldir buffer * @dip: dinode pointer - * @offset: the hash of the last entry read shifted to the right once - * @opaque: buffer for the filldir function to fill - * @filldir: points to the filldir function to use + * @ctx: actor to feed the entries to * * Returns: errno */ -static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, struct file_ra_state *f_ra) +static int dir_e_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); u32 hsize, len = 0; @@ -1447,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - hash = gfs2_dir_offset2hash(*offset); + hash = gfs2_dir_offset2hash(ctx->pos); index = hash >> (32 - dip->i_depth); if (dip->i_hash_cache == NULL) @@ -1459,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, gfs2_dir_readahead(inode, hsize, index, f_ra); while (index < hsize) { - error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + error = gfs2_dir_read_leaf(inode, ctx, &copied, &depth, be64_to_cpu(lp[index])); if (error) @@ -1474,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, return error; } -int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, struct file_ra_state *f_ra) +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1489,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, return 0; if (dip->i_diskflags & GFS2_DIF_EXHASH) - return dir_e_read(inode, offset, opaque, filldir, f_ra); + return dir_e_read(inode, ctx, f_ra); if (!gfs2_is_stuffed(dip)) { gfs2_consist_inode(dip); @@ -1521,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = -EIO; goto out; } - error = do_filldir_main(dip, offset, opaque, filldir, darr, + error = do_filldir_main(dip, ctx, darr, dip->i_entries, &copied); out: kfree(darr); @@ -1537,9 +1548,9 @@ out: /** * gfs2_dir_search - Search a directory - * @dip: The GFS2 inode - * @filename: - * @inode: + * @dip: The GFS2 dir inode + * @name: The name we are looking up + * @fail_on_exist: Fail if the name exists rather than looking it up * * This routine searches a directory for a file or another directory. * Assumes a glock is held on dip. @@ -1547,22 +1558,25 @@ out: * Returns: errno */ -struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, + bool fail_on_exist) { struct buffer_head *bh; struct gfs2_dirent *dent; - struct inode *inode; + u64 addr, formal_ino; + u16 dtype; dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { if (IS_ERR(dent)) return ERR_CAST(dent); - inode = gfs2_inode_lookup(dir->i_sb, - be16_to_cpu(dent->de_type), - be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino), 0); + dtype = be16_to_cpu(dent->de_type); + addr = be64_to_cpu(dent->de_inum.no_addr); + formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); - return inode; + if (fail_on_exist) + return ERR_PTR(-EEXIST); + return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); } return ERR_PTR(-ENOENT); } @@ -1846,6 +1860,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); ht = kzalloc(size, GFP_NOFS); + if (ht == NULL) + ht = vzalloc(size); if (!ht) return -ENOMEM; @@ -1933,7 +1949,10 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - kfree(ht); + if (is_vmalloc_addr(ht)) + vfree(ht); + else + kfree(ht); return error; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 98c960beab35..4f03bbd1873f 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -18,14 +18,15 @@ struct gfs2_inode; struct gfs2_inum; extern struct inode *gfs2_dir_search(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + bool fail_on_exist); extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); -extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, struct file_ra_state *f_ra); +extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, const struct gfs2_inode *nip, unsigned int new_type); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9973df4ff565..8b9b3775e2e7 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, } struct get_name_filldir { + struct dir_context ctx; struct gfs2_inum_host inum; char *name; }; @@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name, struct inode *dir = parent->d_inode; struct inode *inode = child->d_inode; struct gfs2_inode *dip, *ip; - struct get_name_filldir gnfd; + struct get_name_filldir gnfd = { + .ctx.actor = get_name_filldir, + .name = name + }; struct gfs2_holder gh; - u64 offset = 0; int error; struct file_ra_state f_ra = { .start = 0 }; @@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name, *name = 0; gnfd.inum.no_addr = ip->i_no_addr; gnfd.inum.no_formal_ino = ip->i_no_formal_ino; - gnfd.name = name; error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); if (error) return error; - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); + error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..72c3866a7320 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" @@ -81,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) } /** - * gfs2_readdir - Read directory entries from a directory + * gfs2_readdir - Iterator for a directory * @file: The directory to read from - * @dirent: Buffer for dirents - * @filldir: Function used to do the copying + * @ctx: What to feed directory entries to * * Returns: errno */ -static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +static int gfs2_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file->f_mapping->host; struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_holder d_gh; - u64 offset = file->f_pos; int error; - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - error = gfs2_glock_nq(&d_gh); - if (error) { - gfs2_holder_uninit(&d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) return error; - } - error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); + error = gfs2_dir_read(dir, ctx, &file->f_ra); gfs2_glock_dq_uninit(&d_gh); - file->f_pos = offset; - return error; } @@ -401,16 +395,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); + ret = get_write_access(inode); + if (ret) + goto out; + ret = gfs2_rs_alloc(ip); if (ret) - return ret; + goto out_write_access; gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) - goto out; + goto out_uninit; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -479,12 +477,15 @@ out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); -out: +out_uninit: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_for_stable_page(page); } +out_write_access: + put_write_access(inode); +out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } @@ -530,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) } /** - * gfs2_open - open a file - * @inode: the inode to open - * @file: the struct file for this opening + * gfs2_open_common - This is common to open and atomic_open + * @inode: The inode being opened + * @file: The file being opened * - * Returns: errno + * This maybe called under a glock or not depending upon how it has + * been called. We must always be called under a glock for regular + * files, however. For other file types, it does not matter whether + * we hold the glock or not. + * + * Returns: Error code or 0 for success */ -static int gfs2_open(struct inode *inode, struct file *file) +int gfs2_open_common(struct inode *inode, struct file *file) { - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; struct gfs2_file *fp; - int error; + int ret; + + if (S_ISREG(inode->i_mode)) { + ret = generic_file_open(inode, file); + if (ret) + return ret; + } - fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); if (!fp) return -ENOMEM; @@ -552,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file) gfs2_assert_warn(GFS2_SB(inode), !file->private_data); file->private_data = fp; + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * After atomic_open, this function is only used for opening files + * which are already cached. We must still get the glock for regular + * files to ensure that we have the file size uptodate for the large + * file check which is in the common code. That is only an issue for + * regular files though. + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + bool need_unlock = false; if (S_ISREG(ip->i_inode.i_mode)) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) - goto fail; + return error; + need_unlock = true; + } - if (!(file->f_flags & O_LARGEFILE) && - i_size_read(inode) > MAX_NON_LFS) { - error = -EOVERFLOW; - goto fail_gunlock; - } + error = gfs2_open_common(inode, file); + if (need_unlock) gfs2_glock_dq_uninit(&i_gh); - } - - return 0; -fail_gunlock: - gfs2_glock_dq_uninit(&i_gh); -fail: - file->private_data = NULL; - kfree(fp); return error; } @@ -593,10 +617,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if ((file->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - gfs2_rs_delete(ip); + if (!(file->f_mode & FMODE_WRITE)) + return 0; + gfs2_rs_delete(ip); return 0; } @@ -888,7 +912,7 @@ out_uninit: * cluster; until we do, disable leases (by just returning -EINVAL), * unless the administrator has requested purely local locking. * - * Locking: called under lock_flocks + * Locking: called under i_lock * * Returns: errno */ @@ -1040,7 +1064,7 @@ const struct file_operations gfs2_file_fops = { }; const struct file_operations gfs2_dir_fops = { - .readdir = gfs2_readdir, + .iterate = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, @@ -1070,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = { }; const struct file_operations gfs2_dir_fops_nolock = { - .readdir = gfs2_readdir, + .iterate = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index cf3515546739..9435384562a2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -912,7 +912,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) */ static void handle_callback(struct gfs2_glock *gl, unsigned int state, - unsigned long delay) + unsigned long delay, bool remote) { int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; @@ -925,8 +925,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state = LM_ST_UNLOCKED; } if (gl->gl_ops->go_callback) - gl->gl_ops->go_callback(gl); - trace_gfs2_demote_rq(gl); + gl->gl_ops->go_callback(gl, remote); + trace_gfs2_demote_rq(gl, remote); } void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) @@ -1017,11 +1017,11 @@ do_cancel: return; trap_recursive: - print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); - print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); @@ -1091,7 +1091,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); if (find_first_holder(gl) == NULL) { @@ -1279,19 +1279,6 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) gfs2_glock_dq(&ghs[num_gh]); } -/** - * gfs2_glock_dq_uninit_m - release multiple glocks - * @num_gh: the number of structures - * @ghs: an array of struct gfs2_holder structures - * - */ - -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) -{ - while (num_gh--) - gfs2_glock_dq_uninit(&ghs[num_gh]); -} - void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { unsigned long delay = 0; @@ -1309,7 +1296,7 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) } spin_lock(&gl->gl_spin); - handle_callback(gl, state, delay); + handle_callback(gl, state, delay, true); spin_unlock(&gl->gl_spin); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); @@ -1422,7 +1409,7 @@ __acquires(&lru_lock) spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); if (demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); smp_mb__after_clear_bit(); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1547,7 +1534,7 @@ static void clear_glock(struct gfs2_glock *gl) spin_lock(&gl->gl_spin); if (gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1590,6 +1577,7 @@ static void dump_glock_func(struct gfs2_glock *gl) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); + flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index fd580b7861d5..69f66e3d22bf 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,6 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 444b6503ebc4..5f2e5224c51c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); - sector_t blocknr; gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); @@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) continue; gfs2_ail_error(gl, bh); } - blocknr = bh->b_blocknr; - bh->b_private = NULL; - gfs2_remove_from_ail(bd); /* drops ref on bh */ - - bd->bd_bh = NULL; - bd->bd_blkno = blocknr; - gfs2_trans_add_revoke(sdp, bd); } GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); @@ -515,12 +507,12 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) * * gl_spin lock is held while calling this */ -static void iopen_go_callback(struct gfs2_glock *gl) +static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; struct gfs2_sbd *sdp = gl->gl_sbd; - if (sdp->sd_vfs->s_flags & MS_RDONLY) + if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 5c29216e9cc1..26aabd7caba7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -31,7 +31,6 @@ struct gfs2_holder; struct gfs2_glock; struct gfs2_quota_data; struct gfs2_trans; -struct gfs2_ail; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; @@ -53,7 +52,7 @@ struct gfs2_log_header_host { struct gfs2_log_operations { void (*lo_before_commit) (struct gfs2_sbd *sdp); - void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, @@ -139,7 +138,7 @@ struct gfs2_bufdata { struct list_head bd_list; const struct gfs2_log_operations *bd_ops; - struct gfs2_ail *bd_ail; + struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; struct list_head bd_ail_gl_list; }; @@ -211,7 +210,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); - void (*go_callback) (struct gfs2_glock *gl); + void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 @@ -433,6 +432,7 @@ struct gfs2_trans { struct gfs2_holder tr_t_gh; int tr_touched; + int tr_attached; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -440,14 +440,12 @@ struct gfs2_trans { unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; unsigned int tr_num_revoke_rm; -}; -struct gfs2_ail { - struct list_head ai_list; + struct list_head tr_list; - unsigned int ai_first; - struct list_head ai_ail1_list; - struct list_head ai_ail2_list; + unsigned int tr_first; + struct list_head tr_ail1_list; + struct list_head tr_ail2_list; }; struct gfs2_journal_extent { @@ -710,6 +708,7 @@ struct gfs2_sbd { spinlock_t sd_log_lock; + struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; unsigned int sd_log_commited_buf; unsigned int sd_log_commited_databuf; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index cc00bd1d1f87..bbb2715171cd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: @@ -312,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, goto out; } - inode = gfs2_dir_search(dir, name); + inode = gfs2_dir_search(dir, name, false); if (IS_ERR(inode)) error = PTR_ERR(inode); out: @@ -345,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, if (!dip->i_inode.i_nlink) return -ENOENT; - error = gfs2_dir_check(&dip->i_inode, name, NULL); - switch (error) { - case -ENOENT: - error = 0; - break; - case 0: - return -EEXIST; - default: - return error; - } - if (dip->i_entries == (u32)-1) return -EFBIG; if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) @@ -392,11 +382,15 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) int error; int dblocks = 1; - error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipreserv; @@ -409,6 +403,8 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) out_ipreserv: gfs2_inplace_release(ip); +out_quota: + gfs2_quota_unlock(ip); out: return error; } @@ -440,59 +436,27 @@ static void gfs2_init_dir(struct buffer_head *dibh, */ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, - const char *symname, struct buffer_head **bhp) + const char *symname) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head *dibh; - struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; + gfs2_dinode_out(ip, di); - di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); - di->di_mode = cpu_to_be32(ip->i_inode.i_mode); - di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); - di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); - di->di_nlink = 0; - di->di_size = cpu_to_be64(ip->i_inode.i_size); - di->di_blocks = cpu_to_be64(1); - di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr); - di->di_generation = cpu_to_be64(ip->i_generation); - di->di_flags = 0; di->__pad1 = 0; - di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0); - di->di_height = 0; di->__pad2 = 0; di->__pad3 = 0; - di->di_depth = 0; - di->di_entries = 0; memset(&di->__pad4, 0, sizeof(di->__pad4)); - di->di_eattr = 0; - di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); switch(ip->i_inode.i_mode & S_IFMT) { - case S_IFREG: - if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || - gfs2_tune_get(sdp, gt_new_files_jdata)) - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - break; case S_IFDIR: - di->di_flags |= cpu_to_be32(dip->i_diskflags & - GFS2_DIF_INHERIT_JDATA); - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); - di->di_entries = cpu_to_be32(2); gfs2_init_dir(dibh, dip); break; case S_IFLNK: @@ -501,63 +465,17 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, } set_buffer_uptodate(dibh); - - *bhp = dibh; -} - -static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, - const char *symname, struct buffer_head **bhp) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - int error; - - error = gfs2_rindex_update(sdp); - if (error) - return error; - - error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid); - if (error) - return error; - - error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid); - if (error) - goto out_quota; - - error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); - if (error) - goto out_quota; - - init_dinode(dip, ip, symname, bhp); - gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid); - gfs2_trans_end(sdp); - -out_quota: - gfs2_quota_unlock(dip); - return error; + brelse(dibh); } static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) + struct gfs2_inode *ip, int arq) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - int alloc_required; - struct buffer_head *dibh; int error; - error = gfs2_rindex_update(sdp); - if (error) - return error; - - error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - goto fail; - - error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); - if (alloc_required < 0) - goto fail_quota_locks; - if (alloc_required) { - error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); + if (arq) { + error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -581,26 +499,12 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_end_trans; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - return 0; - fail_end_trans: gfs2_trans_end(sdp); - fail_ipreserv: - if (alloc_required) - gfs2_inplace_release(dip); - + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); - -fail: return error; } @@ -631,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, * gfs2_create_inode - Create a new inode * @dir: The parent directory * @dentry: The new dentry + * @file: If non-NULL, the file which is being opened * @mode: The permissions on the new inode * @dev: For device nodes, this is the device number * @symname: For symlinks, this is the link destination @@ -640,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, */ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + struct file *file, umode_t mode, dev_t dev, const char *symname, - unsigned int size, int excl) + unsigned int size, int excl, int *opened) { const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; @@ -649,9 +555,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; + struct dentry *d; int error; - struct buffer_head *bh = NULL; u32 aflags = 0; + int arq; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -660,36 +567,81 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) return error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; error = create_ok(dip, name, mode); - if ((error == -EEXIST) && S_ISREG(mode) && !excl) { - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - gfs2_glock_dq_uninit(ghs); - d_instantiate(dentry, inode); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; - } if (error) goto fail_gunlock; - inode = new_inode(sdp->sd_vfs); - if (!inode) { + inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); + error = PTR_ERR(inode); + if (!IS_ERR(inode)) { + d = d_splice_alias(inode, dentry); + error = 0; + if (file && !IS_ERR(d)) { + if (d == NULL) + d = dentry; + if (S_ISREG(inode->i_mode)) + error = finish_open(file, d, gfs2_open_common, opened); + else + error = finish_no_open(file, d); + } gfs2_glock_dq_uninit(ghs); - return -ENOMEM; + if (IS_ERR(d)) + return PTR_RET(d); + return error; + } else if (error != -ENOENT) { + goto fail_gunlock; } + + arq = error = gfs2_diradd_alloc_required(dir, name); + if (error < 0) + goto fail_gunlock; + + inode = new_inode(sdp->sd_vfs); + error = -ENOMEM; + if (!inode) + goto fail_gunlock; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) goto fail_free_inode; - set_bit(GIF_INVALID, &ip->i_flags); inode->i_mode = mode; + set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); ip->i_goal = dip->i_goal; + ip->i_diskflags = 0; + ip->i_eattr = 0; + ip->i_height = 0; + ip->i_depth = 0; + ip->i_entries = 0; + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + ip->i_diskflags |= GFS2_DIF_JDATA; + gfs2_set_aops(inode); + break; + case S_IFDIR: + ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; + break; + } + gfs2_set_inode_flags(inode); if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || (dip->i_diskflags & GFS2_DIF_TOPDIR)) @@ -708,10 +660,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - error = make_dinode(dip, ip, symname, &bh); + error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) goto fail_gunlock2; + init_dinode(dip, ip, symname); + gfs2_trans_end(sdp); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_gunlock2; @@ -725,10 +680,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_inode_refresh(ip); - if (error) - goto fail_gunlock3; - error = gfs2_acl_create(dip, inode); if (error) goto fail_gunlock3; @@ -737,20 +688,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip); + error = link_dinode(dip, name, ip, arq); if (error) goto fail_gunlock3; - if (bh) - brelse(bh); - - gfs2_trans_end(sdp); - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); mark_inode_dirty(inode); - gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); - return 0; + if (file) + error = finish_open(file, dentry, gfs2_open_common, opened); + gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(ghs + 1); + return error; fail_gunlock3: gfs2_glock_dq_uninit(ghs + 1); @@ -769,12 +717,12 @@ fail_free_inode: fail_gunlock: gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { + clear_nlink(inode); + mark_inode_dirty(inode); set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); iput(inode); } fail: - if (bh) - brelse(bh); return error; } @@ -790,36 +738,56 @@ fail: static int gfs2_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); + return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL); } /** - * gfs2_lookup - Look up a filename in a directory and return its inode + * __gfs2_lookup - Look up a filename in a directory and return its inode * @dir: The directory inode * @dentry: The dentry of the new inode - * @nd: passed from Linux VFS, ignored by us + * @file: File to be opened + * @opened: atomic_open flags * - * Called by the VFS layer. Lock dir and call gfs2_lookupi() * * Returns: errno */ -static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct file *file, int *opened) { - struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode && !IS_ERR(inode)) { - struct gfs2_glock *gl = GFS2_I(inode)->i_gl; - struct gfs2_holder gh; - int error; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) { - iput(inode); - return ERR_PTR(error); - } - gfs2_glock_dq_uninit(&gh); + struct inode *inode; + struct dentry *d; + struct gfs2_holder gh; + struct gfs2_glock *gl; + int error; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (!inode) + return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + gl = GFS2_I(inode)->i_gl; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); } - return d_splice_alias(inode, dentry); + + d = d_splice_alias(inode, dentry); + if (file && S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common, opened); + + gfs2_glock_dq_uninit(&gh); + if (error) + return ERR_PTR(error); + return d; +} + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned flags) +{ + return __gfs2_lookup(dir, dentry, NULL, NULL); } /** @@ -1137,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); + return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); } /** @@ -1151,7 +1119,9 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); } /** @@ -1166,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); + return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL); +} + +/** + * gfs2_atomic_open - Atomically open a file + * @dir: The directory + * @dentry: The proposed new entry + * @file: The proposed new struct file + * @flags: open flags + * @mode: File mode + * @opened: Flag to say whether the file has been opened or not + * + * Returns: error code or 0 for success + */ + +static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + struct dentry *d; + bool excl = !!(flags & O_EXCL); + + d = __gfs2_lookup(dir, dentry, file, opened); + if (IS_ERR(d)) + return PTR_ERR(d); + if (d == NULL) + d = dentry; + if (d->d_inode) { + if (!(*opened & FILE_OPENED)) + return finish_no_open(file, d); + return 0; + } + + if (!(flags & O_CREAT)) + return -ENOENT; + + return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened); } /* @@ -1846,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .atomic_open = gfs2_atomic_open, }; const struct inode_operations gfs2_symlink_iops = { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c53c7477f6da..ba4d9492d422 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern int gfs2_open_common(struct inode *inode, struct file *file); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a2ca8be7647..610613fb65b5 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { - bd->bd_ail = NULL; + bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); list_del_init(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); @@ -90,7 +90,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_ail *ai) + struct gfs2_trans *tr) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -99,15 +99,15 @@ __acquires(&sdp->sd_ail_lock) struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); continue; } @@ -116,7 +116,7 @@ __acquires(&sdp->sd_ail_lock) if (gl == bd->bd_gl) continue; gl = bd->bd_gl; - list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); mapping = bh->b_page->mapping; if (!mapping) continue; @@ -144,15 +144,15 @@ __acquires(&sdp->sd_ail_lock) void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) { struct list_head *head = &sdp->sd_ail1_list; - struct gfs2_ail *ai; + struct gfs2_trans *tr; trace_gfs2_ail_flush(sdp, wbc, 1); spin_lock(&sdp->sd_ail_lock); restart: - list_for_each_entry_reverse(ai, head, ai_list) { + list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, ai)) + if (gfs2_ail1_start_one(sdp, wbc, tr)) goto restart; } spin_unlock(&sdp->sd_ail_lock); @@ -183,20 +183,20 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); if (buffer_busy(bh)) continue; if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } } @@ -210,16 +210,17 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) static int gfs2_ail1_empty(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai, *s; + struct gfs2_trans *tr, *s; + int oldest_tr = 1; int ret; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { - gfs2_ail1_empty_one(sdp, ai); - if (list_empty(&ai->ai_ail1_list)) - list_move(&ai->ai_list, &sdp->sd_ail2_list); + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { + gfs2_ail1_empty_one(sdp, tr); + if (list_empty(&tr->tr_ail1_list) && oldest_tr) + list_move(&tr->tr_list, &sdp->sd_ail2_list); else - break; + oldest_tr = 0; } ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); @@ -229,13 +230,13 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) static void gfs2_ail1_wait(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; struct gfs2_bufdata *bd; struct buffer_head *bh; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { - list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; if (!buffer_locked(bh)) continue; @@ -256,40 +257,40 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp) * */ -static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &ai->ai_ail2_list; + struct list_head *head = &tr->tr_ail2_list; struct gfs2_bufdata *bd; while (!list_empty(head)) { bd = list_entry(head->prev, struct gfs2_bufdata, bd_ail_st_list); - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); gfs2_remove_from_ail(bd); } } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { - struct gfs2_ail *ai, *safe; + struct gfs2_trans *tr, *safe; unsigned int old_tail = sdp->sd_log_tail; int wrap = (new_tail < old_tail); int a, b, rm; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { - a = (old_tail <= ai->ai_first); - b = (ai->ai_first < new_tail); + list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { + a = (old_tail <= tr->tr_first); + b = (tr->tr_first < new_tail); rm = (wrap) ? (a || b) : (a && b); if (!rm) continue; - gfs2_ail2_empty_one(sdp, ai); - list_del(&ai->ai_list); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); - kfree(ai); + gfs2_ail2_empty_one(sdp, tr); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + kfree(tr); } spin_unlock(&sdp->sd_ail_lock); @@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { - unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); unsigned wanted = blks + reserved_blks; DEFINE_WAIT(wait); int did_wait = 0; @@ -435,7 +436,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) static unsigned int current_tail(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; unsigned int tail; spin_lock(&sdp->sd_ail_lock); @@ -443,8 +444,9 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; } else { - ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); - tail = ai->ai_first; + tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans, + tr_list); + tail = tr->tr_first; } spin_unlock(&sdp->sd_ail_lock); @@ -544,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip) spin_unlock(&sdp->sd_ordered_lock); } +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct buffer_head *bh = bd->bd_bh; + struct gfs2_glock *gl = bd->bd_gl; + + gfs2_remove_from_ail(bd); + bd->bd_bh = NULL; + bh->b_private = NULL; + bd->bd_blkno = bh->b_blocknr; + bd->bd_ops = &gfs2_revoke_lops; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); +} + +void gfs2_write_revokes(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd, *tmp; + int have_revokes = 0; + int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + + gfs2_ail1_empty(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { + if (list_empty(&bd->bd_list)) { + have_revokes = 1; + goto done; + } + } + } +done: + spin_unlock(&sdp->sd_ail_lock); + if (have_revokes == 0) + return; + while (sdp->sd_log_num_revoke > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + max_revokes -= sdp->sd_log_num_revoke; + if (!sdp->sd_log_num_revoke) { + atomic_dec(&sdp->sd_log_blks_free); + /* If no blocks have been reserved, we need to also + * reserve a block for the header */ + if (!sdp->sd_log_blks_reserved) + atomic_dec(&sdp->sd_log_blks_free); + } + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { + if (max_revokes == 0) + goto out_of_blocks; + if (!list_empty(&bd->bd_list)) + continue; + gfs2_add_revoke(sdp, bd); + max_revokes--; + } + } +out_of_blocks: + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + if (!sdp->sd_log_num_revoke) { + atomic_inc(&sdp->sd_log_blks_free); + if (!sdp->sd_log_blks_reserved) + atomic_inc(&sdp->sd_log_blks_free); + } +} + /** * log_write_header - Get and initialize a journal header buffer * @sdp: The GFS2 superblock @@ -561,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) lh = page_address(page); clear_page(lh); - gfs2_ail1_empty(sdp); tail = current_tail(sdp); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -600,7 +671,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; down_write(&sdp->sd_log_flush_lock); @@ -611,9 +682,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); - ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); - INIT_LIST_HEAD(&ai->ai_ail1_list); - INIT_LIST_HEAD(&ai->ai_ail2_list); + tr = sdp->sd_log_tr; + if (tr) { + sdp->sd_log_tr = NULL; + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); + } if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, @@ -630,7 +704,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; - ai->ai_first = sdp->sd_log_flush_head; + if (tr) + tr->tr_first = sdp->sd_log_flush_head; gfs2_ordered_write(sdp); lops_before_commit(sdp); @@ -643,7 +718,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, 0); } - lops_after_commit(sdp, ai); + lops_after_commit(sdp, tr); gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; @@ -653,16 +728,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_commited_revoke = 0; spin_lock(&sdp->sd_ail_lock); - if (!list_empty(&ai->ai_ail1_list)) { - list_add(&ai->ai_list, &sdp->sd_ail1_list); - ai = NULL; + if (tr && !list_empty(&tr->tr_ail1_list)) { + list_add(&tr->tr_list, &sdp->sd_ail1_list); + tr = NULL; } spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); - kfree(ai); + kfree(tr); } static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) @@ -687,6 +762,12 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; + if (sdp->sd_log_tr == NULL && + (tr->tr_num_buf_new || tr->tr_num_databuf_new)) { + gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } gfs2_log_unlock(sdp); } @@ -708,7 +789,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - up_read(&sdp->sd_log_flush_lock); if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3566f35915e0..37216634f0aa 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); +extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_write_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a5055977a214..17c5b5d7dc88 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -16,6 +16,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/bio.h> #include <linux/fs.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -53,8 +54,8 @@ void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) * to in-place disk block, remove it from the AIL. */ spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) - list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + if (bd->bd_tr) + list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list); spin_unlock(&sdp->sd_ail_lock); get_bh(bh); atomic_inc(&sdp->sd_log_pinned); @@ -94,7 +95,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) */ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, - struct gfs2_ail *ai) + struct gfs2_trans *tr) { struct gfs2_bufdata *bd = bh->b_private; @@ -109,7 +110,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, maybe_release_space(bd); spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) { + if (bd->bd_tr) { list_del(&bd->bd_ail_st_list); brelse(bh); } else { @@ -117,8 +118,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); atomic_inc(&gl->gl_ail_count); } - bd->bd_ail = ai; - list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + bd->bd_tr = tr; + list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list); spin_unlock(&sdp->sd_ail_lock); clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); @@ -212,7 +213,7 @@ static void gfs2_end_log_write(struct bio *bio, int error) fs_err(sdp, "Error %d writing to log\n", error); } - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) gfs2_end_log_write_bh(sdp, bvec, error); @@ -300,7 +301,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) u64 nblk; if (bio) { - nblk = bio->bi_sector + bio_sectors(bio); + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; @@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh) kunmap_atomic(kaddr); } +static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_bufdata *bda, *bdb; + + bda = list_entry(a, struct gfs2_bufdata, bd_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_list); + + if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + return -1; + if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + return 1; + return 0; +} + static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, unsigned int total, struct list_head *blist, bool is_databuf) @@ -413,13 +428,16 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, __be64 *ptr; gfs2_log_lock(sdp); + list_sort(NULL, blist, blocknr_cmp); bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); while(total) { num = total; if (total > limit) num = limit; gfs2_log_unlock(sdp); - page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); ld = page_address(page); gfs2_log_lock(sdp); ptr = (__be64 *)(ld + 1); @@ -480,17 +498,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) &sdp->sd_log_le_buf, 0); } -static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_buf; struct gfs2_bufdata *bd; + if (tr == NULL) { + gfs2_assert(sdp, list_empty(head)); + return; + } + while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); sdp->sd_log_num_buf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + gfs2_unpin(sdp, bd->bd_bh, tr); } gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); } @@ -583,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) struct page *page; unsigned int length; + gfs2_write_revokes(sdp); if (!sdp->sd_log_num_revoke) return; @@ -613,7 +637,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_write_page(sdp, page); } -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_revoke; struct gfs2_bufdata *bd; @@ -791,16 +815,21 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_databuf; struct gfs2_bufdata *bd; + if (tr == NULL) { + gfs2_assert(sdp, list_empty(head)); + return; + } + while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); sdp->sd_log_num_databuf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + gfs2_unpin(sdp, bd->bd_bh, tr); } gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); } @@ -824,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { .lo_name = "revoke", }; -const struct gfs2_log_operations gfs2_rg_lops = { - .lo_name = "rg", -}; - const struct gfs2_log_operations gfs2_databuf_lops = { .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, @@ -839,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = { const struct gfs2_log_operations *gfs2_log_ops[] = { &gfs2_databuf_lops, &gfs2_buf_lops, - &gfs2_rg_lops, &gfs2_revoke_lops, NULL, }; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index ba77b7da8325..9ca2e6438419 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -23,7 +23,6 @@ extern const struct gfs2_log_operations gfs2_glock_lops; extern const struct gfs2_log_operations gfs2_buf_lops; extern const struct gfs2_log_operations gfs2_revoke_lops; -extern const struct gfs2_log_operations gfs2_rg_lops; extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; @@ -55,12 +54,13 @@ static inline void lops_before_commit(struct gfs2_sbd *sdp) gfs2_log_ops[x]->lo_before_commit(sdp); } -static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static inline void lops_after_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_after_commit) - gfs2_log_ops[x]->lo_after_commit(sdp, ai); + gfs2_log_ops[x]->lo_after_commit(sdp, tr); } static inline void lops_before_scan(struct gfs2_jdesc *jd, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index b059bbb5059e..0da390686c08 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -295,11 +295,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int } if (bd) { spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) { - gfs2_remove_from_ail(bd); - bh->b_private = NULL; - bd->bd_bh = NULL; - bd->bd_blkno = bh->b_blocknr; + if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); } spin_unlock(&sdp->sd_ail_lock); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 60ede2a0f43f..0262c190b6f9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) goto fail_quotad; p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - error = IS_ERR(p); - if (error) { + if (IS_ERR(p)) { + error = PTR_ERR(p); fs_err(sdp, "can't start logd thread: %d\n", error); return error; } sdp->sd_logd_process = p; p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - error = IS_ERR(p); - if (error) { + if (IS_ERR(p)) { + error = PTR_ERR(p); fs_err(sdp, "can't start quotad thread: %d\n", error); goto fail; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c7c840e916f8..3768c2f40e43 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -121,7 +121,7 @@ static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; return (2 * (u64)from_kqid(&init_user_ns, qid)) + - (qid.type == USRQUOTA) ? 0 : 1; + ((qid.type == USRQUOTA) ? 0 : 1); } static u64 qd2offset(struct gfs2_quota_data *qd) @@ -721,7 +721,7 @@ get_a_page: goto unlock_out; } - gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_trans_add_data(ip->i_gl, bh); kaddr = kmap_atomic(page); if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) @@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type) return error; } -static int gfs2_quota_sync_timeo(struct super_block *sb, int type) -{ - return gfs2_quota_sync(sb, type); -} - int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; @@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data) &tune->gt_statfs_quantum); /* Update quota file */ - quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, + quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, "ad_timeo, &tune->gt_quota_quantum); /* Check for & recover partially truncated inodes */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5a51265a4341..69317435faa7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -592,7 +592,7 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) +static void __rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; @@ -605,7 +605,7 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { - /* return reserved blocks to the rgrp and the ip */ + /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; rs->rs_free = 0; @@ -619,14 +619,14 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) +void gfs2_rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; rgd = rs->rs_rbm.rgd; if (rgd) { spin_lock(&rgd->rd_rsspin); - __rs_deltree(ip, rs); + __rs_deltree(rs); spin_unlock(&rgd->rd_rsspin); } } @@ -638,9 +638,11 @@ void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) */ void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if (ip->i_res) { - gfs2_rs_deltree(ip, ip->i_res); + if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { + gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); ip->i_res = NULL; @@ -664,7 +666,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd) spin_lock(&rgd->rd_rsspin); while ((n = rb_first(&rgd->rd_rstree))) { rs = rb_entry(n, struct gfs2_blkreserv, rs_node); - __rs_deltree(NULL, rs); + __rs_deltree(rs); } spin_unlock(&rgd->rd_rsspin); } @@ -1286,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp) minlen = max_t(u64, r.minlen, q->limits.discard_granularity) >> bs_shift; + if (end <= start || minlen > sdp->sd_max_rg_data) + return -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, start, 0); - rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end, 0); - if (end <= start || - minlen > sdp->sd_max_rg_data || - start > rgd_end->rd_data0 + rgd_end->rd_data) - return -EINVAL; + if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end)) + && (start > rgd_end->rd_data0 + rgd_end->rd_data)) + return -EINVAL; /* start is beyond the end of the fs */ while (1) { @@ -1334,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) } out: - r.len = trimmed << 9; + r.len = trimmed << bs_shift; if (copy_to_user(argp, &r, sizeof(r))) return -EFAULT; @@ -1401,9 +1405,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u32 extlen; u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; int ret; + struct inode *inode = &ip->i_inode; - extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); - extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + if (S_ISDIR(inode->i_mode)) + extlen = 1; + else { + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); + extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + } if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) return; @@ -1874,7 +1883,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) /* Drop reservation, if we couldn't use reserved rgrp */ if (gfs2_rs_active(rs)) - gfs2_rs_deltree(ip, rs); + gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) @@ -2087,7 +2096,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, if (rs->rs_free && !ret) goto out; } - __rs_deltree(ip, rs); + __rs_deltree(rs); } out: spin_unlock(&rgd->rd_rsspin); @@ -2180,13 +2189,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, if (dinode) gfs2_trans_add_unrevoke(sdp, block, 1); - /* - * This needs reviewing to see why we cannot do the quota change - * at this point in the dinode case. - */ - if (ndata) - gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, - ip->i_inode.i_gid); + gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); rbm.rgd->rd_free_clone -= *nblocks; trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 842185853f6b..5b3f4a896e6c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -47,7 +47,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern int gfs2_rs_alloc(struct gfs2_inode *ip); -extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs); +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cab77b8ba84f..e5639dec66c4 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode) /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1512,10 +1513,12 @@ out_truncate: out_unlock: /* Error path for case 1 */ if (gfs2_rs_active(ip->i_res)) - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); + } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) @@ -1534,6 +1537,7 @@ out: ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 2ee13e841e9f..20c007d747ab 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -159,9 +159,9 @@ TRACE_EVENT(gfs2_glock_put, /* Callback (local or remote) requesting lock demotion */ TRACE_EVENT(gfs2_demote_rq, - TP_PROTO(const struct gfs2_glock *gl), + TP_PROTO(const struct gfs2_glock *gl, bool remote), - TP_ARGS(gl), + TP_ARGS(gl, remote), TP_STRUCT__entry( __field( dev_t, dev ) @@ -170,6 +170,7 @@ TRACE_EVENT(gfs2_demote_rq, __field( u8, cur_state ) __field( u8, dmt_state ) __field( unsigned long, flags ) + __field( bool, remote ) ), TP_fast_assign( @@ -179,14 +180,16 @@ TRACE_EVENT(gfs2_demote_rq, __entry->cur_state = glock_trace_state(gl->gl_state); __entry->dmt_state = glock_trace_state(gl->gl_demote_state); __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + __entry->remote = remote; ), - TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s", + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, (unsigned long long)__entry->glnum, glock_trace_name(__entry->cur_state), glock_trace_name(__entry->dmt_state), - show_glock_flags(__entry->flags)) + show_glock_flags(__entry->flags), + __entry->remote ? "remote" : "local") ); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 88162fae27a5..2b20d7046bf3 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -96,7 +96,8 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) static void gfs2_print_trans(const struct gfs2_trans *tr) { - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", + (void *)tr->tr_ip); printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", @@ -135,8 +136,10 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (tr->tr_t_gh.gh_gl) { gfs2_glock_dq(&tr->tr_t_gh); gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (!tr->tr_attached) + kfree(tr); } + up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); @@ -267,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - struct gfs2_glock *gl = bd->bd_gl; struct gfs2_trans *tr = current->journal_info; BUG_ON(!list_empty(&bd->bd_list)); - BUG_ON(!list_empty(&bd->bd_ail_st_list)); - BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - bd->bd_ops = &gfs2_revoke_lops; + gfs2_add_revoke(sdp, bd); tr->tr_touched = 1; tr->tr_num_revoke++; - sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); - set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 571abe97b42a..de69d8a24f6d 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); mutex_lock(&tree->tree_lock); return 0; } @@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } @@ -135,8 +137,8 @@ int hfs_brec_find(struct hfs_find_data *fd) return res; invalid: - printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", - height, bnode->height, bnode->type, nidx, parent); + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index c6e97366e8ac..28307bc9ec1e 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); /* are all of the bits in range? */ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index cdb41a1f6a64..d3fa6bd9503e 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -100,7 +100,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_btree *tree; struct page *src_page, *dst_page; - dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; tree = src_node->tree; @@ -120,7 +120,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; @@ -138,16 +138,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - dprint(DBG_BNODE_MOD, " %d", key_off); + hfs_dbg_cont(BNODE_MOD, " %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -155,17 +155,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg_cont(BNODE_MOD, " (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - dprint(DBG_BNODE_MOD, " (%d)", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } - dprint(DBG_BNODE_MOD, "\n"); + hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -220,7 +221,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } @@ -243,7 +244,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } @@ -257,8 +258,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", - node->tree->cnid, node->this); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); @@ -301,7 +302,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -414,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); - BUG_ON(node); + if (node) { + pr_crit("new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); @@ -443,8 +448,9 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); } } @@ -455,8 +461,9 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 92fb358ce824..9f4ee7f52026 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -47,15 +47,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) if (node->tree->attributes & HFS_TREE_BIGKEYS) { retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { - printk(KERN_ERR "hfs: keylen %d too large\n", - retval); + pr_err("keylen %d too large\n", retval); retval = 0; } } else { retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; if (retval > node->tree->max_key_len + 1) { - printk(KERN_ERR "hfs: keylen %d too large\n", - retval); + pr_err("keylen %d too large\n", retval); retval = 0; } } @@ -94,7 +92,8 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) panic("not enough room!\n"); @@ -190,7 +189,8 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); if (!node->parent) @@ -240,7 +240,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -374,7 +374,8 @@ again: newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; @@ -385,7 +386,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - printk(KERN_DEBUG "hfs: splitting index node...\n"); + printk(KERN_DEBUG "splitting index node...\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 1cbdeea1db44..1ab19e660e69 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -48,7 +48,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); if (HFS_I(tree->inode)->alloc_blocks > HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records\n"); + pr_err("invalid btree extent records\n"); unlock_new_inode(tree->inode); goto free_inode; } @@ -60,8 +60,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records " - "(0 size).\n"); + pr_err("invalid btree extent records (0 size)\n"); unlock_new_inode(tree->inode); goto free_inode; } @@ -100,15 +99,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke switch (id) { case HFS_EXT_CNID: if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { - printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", - tree->max_key_len); + pr_err("invalid extent max_key_len %d\n", + tree->max_key_len); goto fail_page; } break; case HFS_CAT_CNID: if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { - printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", - tree->max_key_len); + pr_err("invalid catalog max_key_len %d\n", + tree->max_key_len); goto fail_page; } break; @@ -146,8 +145,9 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) - printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + pr_err("node %d:%d still has %d user(s)!\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } @@ -290,7 +290,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - printk(KERN_DEBUG "hfs: create new bmap node...\n"); + printk(KERN_DEBUG "create new bmap node...\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -316,7 +316,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); @@ -331,7 +331,8 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); + pr_crit("unable to free bnode %u. bmap not found!\n", + node->this); return; } node = hfs_bnode_find(tree, i); @@ -339,7 +340,8 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); + pr_crit("invalid bmap found! (%u,%d)\n", + node->this, node->type); hfs_bnode_put(node); return; } @@ -352,7 +354,8 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); + pr_crit("trying to free free bnode %u(%d)\n", + node->this, node->type); kunmap(page); hfs_bnode_put(node); return; diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 424b0337f524..ff0316b925a5 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,12 +87,15 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode * int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; sb = dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfs_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -184,14 +187,14 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, type = rec.type; if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { - printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + pr_err("found bad thread record in catalog\n"); return -EIO; } fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; if (len > HFS_NAMELEN) { - printk(KERN_ERR "hfs: bad catalog namelength\n"); + pr_err("bad catalog namelength\n"); return -EIO; } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); @@ -212,9 +215,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) struct list_head *pos; int res, type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); sb = dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + return res; hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); res = hfs_brec_find(&fd); @@ -278,10 +283,13 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, int entry_size, type; int err; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 5f7f1abd5f6d..145566851e7a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -25,7 +25,9 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int res; - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) + return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { @@ -49,9 +51,9 @@ done: /* * hfs_readdir */ -static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int hfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; @@ -60,23 +62,24 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct hfs_readdir_data *rd; u16 type; - if (filp->f_pos >= inode->i_size) + if (ctx->pos >= inode->i_size) return 0; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) goto out; - switch ((u32)filp->f_pos) { - case 0: + if (ctx->pos == 0) { /* This is completely artificial... */ - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + if (!dir_emit_dot(file, ctx)) goto out; - filp->f_pos++; - /* fall through */ - case 1: + ctx->pos = 1; + } + if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; @@ -84,31 +87,29 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { - printk(KERN_ERR "hfs: bad catalog folder thread\n"); + pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if (fd.entrylength < HFS_MIN_THREAD_SZ) { - // printk(KERN_ERR "hfs: truncated catalog thread\n"); + // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} - if (filldir(dirent, "..", 2, 1, + if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.ParID), DT_DIR)) goto out; - filp->f_pos++; - /* fall through */ - default: - if (filp->f_pos >= inode->i_size) - goto out; - err = hfs_brec_goto(&fd, filp->f_pos - 1); - if (err) - goto out; + ctx->pos = 2; } + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, ctx->pos - 1); + if (err) + goto out; for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { - printk(KERN_ERR "hfs: walked past end of dir\n"); + pr_err("walked past end of dir\n"); err = -EIO; goto out; } @@ -123,43 +124,43 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { - printk(KERN_ERR "hfs: small dir entry\n"); + pr_err("small dir entry\n"); err = -EIO; goto out; } - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.dir.DirID), DT_DIR)) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { - printk(KERN_ERR "hfs: small file entry\n"); + pr_err("small file entry\n"); err = -EIO; goto out; } - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { - printk(KERN_ERR "hfs: bad catalog entry type %d\n", type); + pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } - filp->f_pos++; - if (filp->f_pos >= inode->i_size) + ctx->pos++; + if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } - rd = filp->private_data; + rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } - filp->private_data = rd; - rd->file = filp; + file->private_data = rd; + rd->file = file; list_add(&rd->list, &HFS_I(inode)->open_dir_list); } memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); @@ -172,7 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { + mutex_lock(&inode->i_mutex); list_del(&rd->list); + mutex_unlock(&inode->i_mutex); kfree(rd); } return 0; @@ -300,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, const struct file_operations hfs_dir_operations = { .read = generic_read_dir, - .readdir = hfs_readdir, + .iterate = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index a67955a0c36f..e33a0d36a93e 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -107,7 +107,7 @@ static u16 hfs_ext_lastblock(struct hfs_extent *ext) return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); } -static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { int res; @@ -116,26 +116,31 @@ static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd res = hfs_brec_find(fd); if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) - return; + return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { if (res) - return; + return res; hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; } + return 0; } -void hfs_ext_write_extent(struct inode *inode) +int hfs_ext_write_extent(struct inode *inode) { struct hfs_find_data fd; + int res = 0; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { - hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); - __hfs_ext_write_extent(inode, &fd); + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; + res = __hfs_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } + return res; } static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, @@ -161,8 +166,11 @@ static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode { int res; - if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) - __hfs_ext_write_extent(inode, fd); + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + res = __hfs_ext_write_extent(inode, fd); + if (res) + return res; + } res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); @@ -185,9 +193,11 @@ static int hfs_ext_read_extent(struct inode *inode, u16 block) block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) return 0; - hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); - res = __hfs_ext_cache_extent(&fd, inode, block); - hfs_find_exit(&fd); + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfs_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } return res; } @@ -195,11 +205,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - dprint(DBG_EXTENT, " "); + hfs_dbg(EXTENT, " "); for (i = 0; i < 3; i++) - dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - dprint(DBG_EXTENT, "\n"); + hfs_dbg_cont(EXTENT, " %u:%u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg_cont(EXTENT, "\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -298,7 +309,9 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) if (total_blocks == blocks) return 0; - hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) + return res; do { res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); if (res) @@ -392,10 +405,10 @@ int hfs_extend_file(struct inode *inode) goto out; } - dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - dprint(DBG_EXTENT, "first extents\n"); + hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -437,8 +450,10 @@ out: return res; insert_extent: - dprint(DBG_EXTENT, "insert new extent\n"); - hfs_ext_write_extent(inode); + hfs_dbg(EXTENT, "insert new extent\n"); + res = hfs_ext_write_extent(inode); + if (res) + goto out; memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); @@ -460,13 +475,13 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, - (long long)HFS_I(inode)->phys_size, inode->i_size); + hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + inode->i_ino, (long long)HFS_I(inode)->phys_size, + inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata; struct page *page; - int res; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; @@ -488,7 +503,12 @@ void hfs_file_truncate(struct inode *inode) goto out; mutex_lock(&HFS_I(inode)->extents_lock); - hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&HFS_I(inode)->extents_lock); + /* XXX: We lack error handling of hfs_file_truncate() */ + return; + } while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { hfs_free_extents(sb, HFS_I(inode)->first_extents, diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 693df9fe52b2..0524cda47a6e 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,6 +9,12 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> @@ -34,8 +40,18 @@ //#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) #define DBG_MASK (0) -#define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) printk(fmt , ## args) +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) + /* * struct hfs_inode_info @@ -174,7 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *, const btree_key *); extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); -extern void hfs_ext_write_extent(struct inode *); +extern int hfs_ext_write_extent(struct inode *); extern int hfs_extend_file(struct inode *); extern void hfs_file_truncate(struct inode *); @@ -213,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; -extern int hfs_hash_dentry(const struct dentry *, const struct inode *, - struct qstr *); +extern int hfs_hash_dentry(const struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); -extern int hfs_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3031dfdd2358..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfs_fs.h" #include "btree.h" @@ -237,7 +238,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { HFS_SB(sb)->folder_count--; if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) @@ -416,9 +417,12 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct inode *main_inode = inode; struct hfs_find_data fd; hfs_cat_rec rec; + int res; - dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); - hfs_ext_write_extent(inode); + hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + res = hfs_ext_write_extent(inode); + if (res) + return res; if (inode->i_ino < HFS_FIRSTUSER_CNID) { switch (inode->i_ino) { @@ -515,7 +519,11 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } fd.search_key->cat = HFS_I(dir)->cat_key; res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (!res) { diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index b7ec224910c5..aa3f0d6d043c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -48,7 +48,7 @@ static int hfs_get_last_session(struct super_block *sb, *start = (sector_t)te.cdte_addr.lba << 2; return 0; } - printk(KERN_ERR "hfs: invalid session number or type of track\n"); + pr_err("invalid session number or type of track\n"); return -EINVAL; } ms_info.addr_format = CDROM_LBA; @@ -101,7 +101,7 @@ int hfs_mdb_get(struct super_block *sb) HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { - printk(KERN_ERR "hfs: bad allocation block size %d\n", size); + pr_err("bad allocation block size %d\n", size); goto out_bh; } @@ -118,7 +118,7 @@ int hfs_mdb_get(struct super_block *sb) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { - printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size); + pr_err("unable to set blocksize to %u\n", size); goto out; } @@ -162,8 +162,8 @@ int hfs_mdb_get(struct super_block *sb) } if (!HFS_SB(sb)->alt_mdb) { - printk(KERN_WARNING "hfs: unable to locate alternate MDB\n"); - printk(KERN_WARNING "hfs: continuing without an alternate MDB\n"); + pr_warn("unable to locate alternate MDB\n"); + pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0); @@ -178,7 +178,7 @@ int hfs_mdb_get(struct super_block *sb) while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { - printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + pr_err("unable to read volume bitmap\n"); goto out; } off2 = off & (sb->s_blocksize - 1); @@ -192,23 +192,22 @@ int hfs_mdb_get(struct super_block *sb) HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { - printk(KERN_ERR "hfs: unable to open extent tree\n"); + pr_err("unable to open extent tree\n"); goto out; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { - printk(KERN_ERR "hfs: unable to open catalog tree\n"); + pr_err("unable to open catalog tree\n"); goto out; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfs is recommended. mounting read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { - printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n"); + pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } if (!(sb->s_flags & MS_RDONLY)) { @@ -312,7 +311,7 @@ void hfs_mdb_commit(struct super_block *sb) while (size) { bh = sb_bread(sb, block); if (!bh) { - printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 495a976a3cc9..85b610c3909f 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -51,8 +51,7 @@ static unsigned char caseorder[256] = { /* * Hash a string to an integer in a case-independent way */ -int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *this) +int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; @@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ -int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index bbaaa8a4ee64..2d2039e754cd 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -117,12 +117,11 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (!(*flags & MS_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfs is recommended. leaving read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); + pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -253,29 +252,29 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) switch (token) { case opt_uid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: uid requires an argument\n"); + pr_err("uid requires an argument\n"); return 0; } hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(hsb->s_uid)) { - printk(KERN_ERR "hfs: invalid uid %d\n", tmp); + pr_err("invalid uid %d\n", tmp); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: gid requires an argument\n"); + pr_err("gid requires an argument\n"); return 0; } hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(hsb->s_gid)) { - printk(KERN_ERR "hfs: invalid gid %d\n", tmp); + pr_err("invalid gid %d\n", tmp); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: umask requires a value\n"); + pr_err("umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; @@ -283,39 +282,39 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_file_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: file_umask requires a value\n"); + pr_err("file_umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; break; case opt_dir_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: dir_umask requires a value\n"); + pr_err("dir_umask requires a value\n"); return 0; } hsb->s_dir_umask = (umode_t)tmp; break; case opt_part: if (match_int(&args[0], &hsb->part)) { - printk(KERN_ERR "hfs: part requires an argument\n"); + pr_err("part requires an argument\n"); return 0; } break; case opt_session: if (match_int(&args[0], &hsb->session)) { - printk(KERN_ERR "hfs: session requires an argument\n"); + pr_err("session requires an argument\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &hsb->s_type)) { - printk(KERN_ERR "hfs: type requires a 4 character value\n"); + pr_err("type requires a 4 character value\n"); return 0; } break; case opt_creator: if (match_fourchar(&args[0], &hsb->s_creator)) { - printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + pr_err("creator requires a 4 character value\n"); return 0; } break; @@ -324,14 +323,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_codepage: if (hsb->nls_disk) { - printk(KERN_ERR "hfs: unable to change codepage\n"); + pr_err("unable to change codepage\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_disk = load_nls(p); if (!hsb->nls_disk) { - printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); + pr_err("unable to load codepage \"%s\"\n", p); kfree(p); return 0; } @@ -339,14 +338,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_iocharset: if (hsb->nls_io) { - printk(KERN_ERR "hfs: unable to change iocharset\n"); + pr_err("unable to change iocharset\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_io = load_nls(p); if (!hsb->nls_io) { - printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); + pr_err("unable to load iocharset \"%s\"\n", p); kfree(p); return 0; } @@ -360,7 +359,7 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) if (hsb->nls_disk && !hsb->nls_io) { hsb->nls_io = load_nls_default(); if (!hsb->nls_io) { - printk(KERN_ERR "hfs: unable to load default iocharset\n"); + pr_err("unable to load default iocharset\n"); return 0; } } @@ -400,7 +399,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) res = -EINVAL; if (!parse_options((char *)data, sbi)) { - printk(KERN_ERR "hfs: unable to parse mount options.\n"); + pr_err("unable to parse mount options\n"); goto bail; } @@ -411,14 +410,16 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) res = hfs_mdb_get(sb); if (res) { if (!silent) - printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n", + pr_warn("can't find a HFS filesystem on dev %s\n", hfs_mdb_name(sb)); res = -EINVAL; goto bail; } /* try to get the root inode */ - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { @@ -447,7 +448,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) return 0; bail_no_root: - printk(KERN_ERR "hfs: get root inode failed.\n"); + pr_err("get root inode failed\n"); bail: hfs_mdb_put(sb); return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 8d691f124714..0f47890299c4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -56,7 +56,7 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, if (name) { len = strlen(name); if (len > HFSPLUS_ATTR_MAX_STRLEN) { - printk(KERN_ERR "hfs: invalid xattr name's length\n"); + pr_err("invalid xattr name's length\n"); return -EINVAL; } hfsplus_asc2uni(sb, @@ -166,10 +166,10 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -228,11 +228,11 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -307,10 +307,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, break; case HFSPLUS_ATTR_FORK_DATA: case HFSPLUS_ATTR_EXTENTS: - printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + pr_err("only inline data xattr are supported\n"); return -EOPNOTSUPP; default: - printk(KERN_ERR "hfs: invalid extended attribute record\n"); + pr_err("invalid extended attribute record\n"); return -ENOENT; } @@ -328,11 +328,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -346,7 +346,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) if (err) goto out; } else { - printk(KERN_ERR "hfs: invalid extended attribute name\n"); + pr_err("invalid extended attribute name\n"); err = -EINVAL; goto out; } @@ -369,10 +369,10 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -384,7 +384,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); if (err) { if (err != -ENOENT) - printk(KERN_ERR "hfs: xattr search failed.\n"); + pr_err("xattr search failed\n"); goto end_delete_all; } diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index d73c98d1ee99..c1422d91cd36 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -22,7 +22,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFSPLUS_CAT_CNID: @@ -44,7 +44,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -56,7 +56,8 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, int *end, int *cur_rec) { - __be32 cur_cnid, search_cnid; + __be32 cur_cnid; + __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; @@ -67,8 +68,11 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; - } else + } else { + cur_cnid = 0; /* used-uninitialized warning */ + search_cnid = 0; BUG(); + } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); @@ -204,7 +208,7 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) return res; invalid: - printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 6feefc0cb48a..d2954451519e 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -30,7 +30,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -89,14 +89,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; } - dprint(DBG_BITMAP, "bitmap full\n"); + hfs_dbg(BITMAP, "bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - dprint(DBG_BITMAP, "bitmap full\n"); + hfs_dbg(BITMAP, "bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -154,7 +154,7 @@ done: *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -173,7 +173,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); /* are all of the bits in range? */ if ((offset + count) > sbi->total_blocks) return -ENOENT; @@ -238,8 +238,7 @@ out: return 0; kaboom: - printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n", - PTR_ERR(page)); + pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page)); mutex_unlock(&sbi->alloc_mutex); return -EIO; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index f31ac6f404f1..11c860204520 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -130,7 +130,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; tree = src_node->tree; @@ -188,7 +188,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page **src_page, **dst_page; int l; - dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; @@ -302,16 +302,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - dprint(DBG_BNODE_MOD, " %d", key_off); + hfs_dbg(BNODE_MOD, " %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -320,17 +320,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, " (%d", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - dprint(DBG_BNODE_MOD, " (%d)", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } - dprint(DBG_BNODE_MOD, "\n"); + hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -366,7 +366,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? */ if (!node->prev && !node->next) - dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -386,7 +386,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node " + pr_err("request for non-existent node " "%d in B*Tree\n", cnid); return NULL; @@ -409,7 +409,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node " + pr_err("request for non-existent node " "%d in B*Tree\n", cnid); return NULL; @@ -425,8 +425,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", - node->tree->cnid, node->this); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); @@ -470,7 +470,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -588,7 +588,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); if (node) { - printk(KERN_CRIT "new node %u already hashed?\n", num); + pr_crit("new node %u already hashed?\n", num); WARN_ON(1); return node; } @@ -620,7 +620,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -633,7 +633,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 298d4e45604b..6e560d56094b 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -45,13 +45,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) if (!recoff) return 0; if (recoff > node->tree->node_size - 2) { - printk(KERN_ERR "hfs: recoff %d too large\n", recoff); + pr_err("recoff %d too large\n", recoff); return 0; } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { - printk(KERN_ERR "hfs: keylen %d too large\n", + pr_err("keylen %d too large\n", retval); retval = 0; } @@ -90,7 +90,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -191,7 +191,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -244,7 +244,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -379,7 +379,7 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -391,7 +391,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n"); + hfs_dbg(BNODE_MOD, "splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index efb689c21a95..0c6540c91167 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -40,8 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->inode = inode; if (!HFSPLUS_I(tree->inode)->first_blocks) { - printk(KERN_ERR - "hfs: invalid btree extent records (0 size).\n"); + pr_err("invalid btree extent records (0 size)\n"); goto free_inode; } @@ -68,12 +67,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) switch (id) { case HFSPLUS_EXT_CNID: if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_page; } if (tree->attributes & HFS_TREE_VARIDXKEYS) { - printk(KERN_ERR "hfs: invalid extent btree flag\n"); + pr_err("invalid extent btree flag\n"); goto fail_page; } @@ -81,12 +80,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) break; case HFSPLUS_CAT_CNID: if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_page; } if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { - printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + pr_err("invalid catalog btree flag\n"); goto fail_page; } @@ -100,19 +99,19 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) break; case HFSPLUS_ATTR_CNID: if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n", + pr_err("invalid attributes max_key_len %d\n", tree->max_key_len); goto fail_page; } tree->keycmp = hfsplus_attr_bin_cmp_key; break; default: - printk(KERN_ERR "hfs: unknown B*Tree requested\n"); + pr_err("unknown B*Tree requested\n"); goto fail_page; } if (!(tree->attributes & HFS_TREE_BIGKEYS)) { - printk(KERN_ERR "hfs: invalid btree flag\n"); + pr_err("invalid btree flag\n"); goto fail_page; } @@ -155,7 +154,7 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) - printk(KERN_CRIT "hfs: node %d:%d " + pr_crit("node %d:%d " "still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); @@ -303,7 +302,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n"); + hfs_dbg(BNODE_MOD, "create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -329,7 +328,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; @@ -345,7 +344,7 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. " + pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); return; @@ -355,7 +354,7 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! " + pr_crit("invalid bmap found! " "(%u,%d)\n", node->this, node->type); hfs_bnode_put(node); @@ -370,7 +369,7 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode " + pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); kunmap(page); diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 840d71edd193..968ce411db53 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -188,12 +188,12 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, type = be16_to_cpu(tmp.type); if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) { - printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + pr_err("found bad thread record in catalog\n"); return -EIO; } if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { - printk(KERN_ERR "hfs: catalog name length corrupted\n"); + pr_err("catalog name length corrupted\n"); return -EIO; } @@ -212,7 +212,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -271,8 +271,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) int err, off; u16 type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", - str ? str->name : NULL, cnid); + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -361,7 +360,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 031c24e50521..d8ce4bd17fc5 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -103,7 +103,7 @@ again: } else if (!dentry->d_fsdata) dentry->d_fsdata = (void *)(unsigned long)cnid; } else { - printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n"); + pr_err("invalid catalog entry type in lookup\n"); err = -EIO; goto fail; } @@ -121,9 +121,9 @@ fail: return ERR_PTR(err); } -static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int hfsplus_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFSPLUS_MAX_STRLEN + 1]; @@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) struct hfsplus_readdir_data *rd; u16 type; - if (filp->f_pos >= inode->i_size) + if (file->f_pos >= inode->i_size) return 0; err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); @@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err) goto out; - switch ((u32)filp->f_pos) { - case 0: + if (ctx->pos == 0) { /* This is completely artificial... */ - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + if (!dir_emit_dot(file, ctx)) goto out; - filp->f_pos++; - /* fall through */ - case 1: + ctx->pos = 1; + } + if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; @@ -159,31 +158,28 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { - printk(KERN_ERR "hfs: bad catalog folder thread\n"); + pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { - printk(KERN_ERR "hfs: truncated catalog thread\n"); + pr_err("truncated catalog thread\n"); err = -EIO; goto out; } - if (filldir(dirent, "..", 2, 1, + if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.parentID), DT_DIR)) goto out; - filp->f_pos++; - /* fall through */ - default: - if (filp->f_pos >= inode->i_size) - goto out; - err = hfs_brec_goto(&fd, filp->f_pos - 1); - if (err) - goto out; + ctx->pos = 2; } - + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, ctx->pos - 1); + if (err) + goto out; for (;;) { if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { - printk(KERN_ERR "hfs: walked past end of dir\n"); + pr_err("walked past end of dir\n"); err = -EIO; goto out; } @@ -203,7 +199,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (type == HFSPLUS_FOLDER) { if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { - printk(KERN_ERR "hfs: small dir entry\n"); + pr_err("small dir entry\n"); err = -EIO; goto out; } @@ -211,40 +207,40 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) HFSPLUS_SB(sb)->hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) goto next; - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.folder.id), DT_DIR)) break; } else if (type == HFSPLUS_FILE) { if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { - printk(KERN_ERR "hfs: small file entry\n"); + pr_err("small file entry\n"); err = -EIO; goto out; } - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.id), DT_REG)) break; } else { - printk(KERN_ERR "hfs: bad catalog entry type\n"); + pr_err("bad catalog entry type\n"); err = -EIO; goto out; } next: - filp->f_pos++; - if (filp->f_pos >= inode->i_size) + ctx->pos++; + if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } - rd = filp->private_data; + rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } - filp->private_data = rd; - rd->file = filp; + file->private_data = rd; + rd->file = file; list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); } memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); @@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { const struct file_operations hfsplus_dir_operations = { .fsync = hfsplus_file_fsync, .read = generic_read_dir, - .readdir = hfsplus_readdir, + .iterate = hfsplus_readdir, .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fe0a76213d9e..fbb212fbb1ef 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -83,7 +83,7 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); } -static void __hfsplus_ext_write_extent(struct inode *inode, +static int __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -98,13 +98,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode, res = hfs_brec_find(fd, hfs_find_rec_by_key); if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) - return; + return res; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } else { if (res) - return; + return res; hfs_bnode_write(fd->bnode, hip->cached_extents, fd->entryoffset, fd->entrylength); hip->extent_state &= ~HFSPLUS_EXT_DIRTY; @@ -117,11 +117,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode, * to explicily mark the inode dirty, too. */ set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); + + return 0; } static int hfsplus_ext_write_extent_locked(struct inode *inode) { - int res; + int res = 0; if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; @@ -129,10 +131,10 @@ static int hfsplus_ext_write_extent_locked(struct inode *inode) res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; - __hfsplus_ext_write_extent(inode, &fd); + res = __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } - return 0; + return res; } int hfsplus_ext_write_extent(struct inode *inode) @@ -175,8 +177,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, WARN_ON(!mutex_is_locked(&hip->extents_lock)); - if (hip->extent_state & HFSPLUS_EXT_DIRTY) - __hfsplus_ext_write_extent(inode, fd); + if (hip->extent_state & HFSPLUS_EXT_DIRTY) { + res = __hfsplus_ext_write_extent(inode, fd); + if (res) + return res; + } res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, block, HFSPLUS_IS_RSRC(inode) ? @@ -265,7 +270,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -288,11 +293,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - dprint(DBG_EXTENT, " "); + hfs_dbg(EXTENT, " "); for (i = 0; i < 8; i++) - dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - dprint(DBG_EXTENT, "\n"); + hfs_dbg_cont(EXTENT, " %u:%u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg_cont(EXTENT, "\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -348,8 +354,8 @@ found: if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - printk(KERN_ERR "hfs: can't free extent\n"); - dprint(DBG_EXTENT, " start: %u count: %u\n", + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", start, count); } extent->block_count = 0; @@ -359,8 +365,8 @@ found: count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - printk(KERN_ERR "hfs: can't free extent\n"); - dprint(DBG_EXTENT, " start: %u count: %u\n", + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -432,7 +438,7 @@ int hfsplus_file_extend(struct inode *inode) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - printk(KERN_ERR "hfs: extend alloc file! " + pr_err("extend alloc file! " "(%llu,%u,%u)\n", sbi->alloc_file->i_size * 8, sbi->total_blocks, sbi->free_blocks); @@ -459,11 +465,11 @@ int hfsplus_file_extend(struct inode *inode) } } - dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - dprint(DBG_EXTENT, "first extents\n"); + hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -500,7 +506,7 @@ out: return res; insert_extent: - dprint(DBG_EXTENT, "insert new extent\n"); + hfs_dbg(EXTENT, "insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -525,9 +531,8 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n", - inode->i_ino, (long long)hip->phys_size, - inode->i_size); + hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 05b11f36024c..ede79317cfb8 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -10,6 +10,12 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/mutex.h> #include <linux/buffer_head.h> @@ -32,9 +38,17 @@ #endif #define DBG_MASK (0) -#define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) \ - printk(fmt , ## args) +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ @@ -481,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, int, const char *, int); -int hfsplus_hash_dentry(const struct dentry *dentry, - const struct inode *inode, struct qstr *str); -int hfsplus_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); +int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 160ccc9cdb4b..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -357,7 +358,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!error) error = error2; } else { - printk(KERN_ERR "hfs: sync non-existent attributes tree\n"); + pr_err("sync non-existent attributes tree\n"); } } @@ -573,7 +574,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); HFSPLUS_I(inode)->create_date = file->create_date; } else { - printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); + pr_err("bad catalog entry used to create inode\n"); res = -EIO; } return res; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index ed257c671615..968eab5bc1f5 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -113,67 +113,67 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) switch (token) { case opt_creator: if (match_fourchar(&args[0], &sbi->creator)) { - printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + pr_err("creator requires a 4 character value\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &sbi->type)) { - printk(KERN_ERR "hfs: type requires a 4 character value\n"); + pr_err("type requires a 4 character value\n"); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: umask requires a value\n"); + pr_err("umask requires a value\n"); return 0; } sbi->umask = (umode_t)tmp; break; case opt_uid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: uid requires an argument\n"); + pr_err("uid requires an argument\n"); return 0; } sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(sbi->uid)) { - printk(KERN_ERR "hfs: invalid uid specified\n"); + pr_err("invalid uid specified\n"); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: gid requires an argument\n"); + pr_err("gid requires an argument\n"); return 0; } sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(sbi->gid)) { - printk(KERN_ERR "hfs: invalid gid specified\n"); + pr_err("invalid gid specified\n"); return 0; } break; case opt_part: if (match_int(&args[0], &sbi->part)) { - printk(KERN_ERR "hfs: part requires an argument\n"); + pr_err("part requires an argument\n"); return 0; } break; case opt_session: if (match_int(&args[0], &sbi->session)) { - printk(KERN_ERR "hfs: session requires an argument\n"); + pr_err("session requires an argument\n"); return 0; } break; case opt_nls: if (sbi->nls) { - printk(KERN_ERR "hfs: unable to change nls mapping\n"); + pr_err("unable to change nls mapping\n"); return 0; } p = match_strdup(&args[0]); if (p) sbi->nls = load_nls(p); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load " + pr_err("unable to load " "nls mapping \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7b87284e46dc..4c4d142cf890 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -132,7 +132,7 @@ static int hfsplus_system_write_inode(struct inode *inode) if (tree) { int err = hfs_btree_write(tree); if (err) { - printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %lu\n", err, inode->i_ino); return err; } @@ -145,7 +145,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -160,7 +160,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -179,7 +179,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - dprint(DBG_SUPER, "hfsplus_sync_fs\n"); + hfs_dbg(SUPER, "hfsplus_sync_fs\n"); /* * Explicitly write out the special metadata inodes. @@ -251,7 +251,7 @@ static void delayed_sync_fs(struct work_struct *work) err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); if (err) - printk(KERN_ERR "hfs: delayed sync fs err %d\n", err); + pr_err("delayed sync fs err %d\n", err); } void hfsplus_mark_mdb_dirty(struct super_block *sb) @@ -275,7 +275,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - dprint(DBG_SUPER, "hfsplus_put_super\n"); + hfs_dbg(SUPER, "hfsplus_put_super\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -333,25 +333,19 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was " - "not cleanly unmounted, " - "running fsck.hfsplus is recommended. " - "leaving read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, " - "leaving read-only.\n"); + pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { - printk(KERN_WARNING "hfs: filesystem is " - "marked journaled, " - "leaving read-only.\n"); + pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -397,7 +391,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; if (!hfsplus_parse_options(data, sbi)) { - printk(KERN_ERR "hfs: unable to parse mount options\n"); + pr_err("unable to parse mount options\n"); goto out_unload_nls; } @@ -405,14 +399,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) nls = sbi->nls; sbi->nls = load_nls("utf8"); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load nls for utf8\n"); + pr_err("unable to load nls for utf8\n"); goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) - printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); + pr_warn("unable to find HFS+ superblock\n"); goto out_unload_nls; } vhdr = sbi->s_vhdr; @@ -421,7 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HFSPLUS_VOLHEAD_SIG; if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { - printk(KERN_ERR "hfs: wrong filesystem version\n"); + pr_err("wrong filesystem version\n"); goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); @@ -445,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { - printk(KERN_ERR "hfs: filesystem size too large.\n"); + pr_err("filesystem size too large\n"); goto out_free_vhdr; } @@ -454,22 +448,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: Filesystem was " - "not cleanly unmounted, " - "running fsck.hfsplus is recommended. " - "mounting read-only.\n"); + pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); + pr_warn("Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "hfs: write access to " - "a journaled filesystem is not supported, " - "use the force option at your own risk, " - "mounting read-only.\n"); + pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -478,18 +466,18 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { - printk(KERN_ERR "hfs: failed to load extents file\n"); + pr_err("failed to load extents file\n"); goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { - printk(KERN_ERR "hfs: failed to load catalog file\n"); + pr_err("failed to load catalog file\n"); goto out_close_ext_tree; } if (vhdr->attr_file.total_blocks != 0) { sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) { - printk(KERN_ERR "hfs: failed to load attributes file\n"); + pr_err("failed to load attributes file\n"); goto out_close_cat_tree; } } @@ -497,7 +485,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { - printk(KERN_ERR "hfs: failed to load allocation file\n"); + pr_err("failed to load allocation file\n"); err = PTR_ERR(inode); goto out_close_attr_tree; } @@ -506,7 +494,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); if (IS_ERR(root)) { - printk(KERN_ERR "hfs: failed to load root directory\n"); + pr_err("failed to load root directory\n"); err = PTR_ERR(root); goto out_put_alloc_file; } diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 2c2e47dcfdd8..e8ef121a4d8b 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *str) +int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) { struct super_block *sb = dentry->d_sb; const char *astr; @@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct super_block *sb = parent->d_sb; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 90effcccca9a..b51a6079108d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -156,7 +156,7 @@ static int hfsplus_get_last_session(struct super_block *sb, *start = (sector_t)te.cdte_addr.lba << 2; return 0; } - printk(KERN_ERR "hfs: invalid session number or type of track\n"); + pr_err("invalid session number or type of track\n"); return -EINVAL; } ms_info.addr_format = CDROM_LBA; @@ -234,8 +234,7 @@ reread: error = -EINVAL; if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { - printk(KERN_WARNING - "hfs: invalid secondary volume header\n"); + pr_warn("invalid secondary volume header\n"); goto out_free_backup_vhdr; } @@ -259,8 +258,7 @@ reread: blocksize >>= 1; if (sb_set_blocksize(sb, blocksize) != blocksize) { - printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", - blocksize); + pr_err("unable to set blocksize to %u!\n", blocksize); goto out_free_backup_vhdr; } diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e8a4b0815c61..f66346155df5 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -107,19 +107,19 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { - printk(KERN_ERR "hfs: catalog searching failed\n"); + pr_err("catalog searching failed\n"); goto end_setxattr; } if (!strcmp_xattr_finder_info(name)) { if (flags & XATTR_CREATE) { - printk(KERN_ERR "hfs: xattr exists yet\n"); + pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -165,7 +165,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, if (hfsplus_attr_exists(inode, name)) { if (flags & XATTR_CREATE) { - printk(KERN_ERR "hfs: xattr exists yet\n"); + pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -177,7 +177,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, goto end_setxattr; } else { if (flags & XATTR_REPLACE) { - printk(KERN_ERR "hfs: cannot replace xattr\n"); + pr_err("cannot replace xattr\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -210,7 +210,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, cat_entry_flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { - printk(KERN_ERR "hfs: invalid catalog entry type\n"); + pr_err("invalid catalog entry type\n"); err = -EIO; goto end_setxattr; } @@ -269,7 +269,7 @@ static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, if (size >= record_len) { res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return res; } res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -340,13 +340,13 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, entry = hfsplus_alloc_attr_entry(); if (!entry) { - printk(KERN_ERR "hfs: can't allocate xattr entry\n"); + pr_err("can't allocate xattr entry\n"); return -ENOMEM; } res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); goto failed_getxattr_init; } @@ -355,7 +355,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, if (res == -ENOENT) res = -ENODATA; else - printk(KERN_ERR "hfs: xattr searching failed\n"); + pr_err("xattr searching failed\n"); goto out; } @@ -368,17 +368,17 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, offsetof(struct hfsplus_attr_inline_data, length)); if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { - printk(KERN_ERR "hfs: invalid xattr record size\n"); + pr_err("invalid xattr record size\n"); res = -EIO; goto out; } } else if (record_type == HFSPLUS_ATTR_FORK_DATA || record_type == HFSPLUS_ATTR_EXTENTS) { - printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + pr_err("only inline data xattr are supported\n"); res = -EOPNOTSUPP; goto out; } else { - printk(KERN_ERR "hfs: invalid xattr record\n"); + pr_err("invalid xattr record\n"); res = -EIO; goto out; } @@ -427,7 +427,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return res; } @@ -506,7 +506,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } @@ -525,8 +525,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) for (;;) { key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); if (key_len == 0 || key_len > fd.tree->max_key_len) { - printk(KERN_ERR "hfs: invalid xattr key length: %d\n", - key_len); + pr_err("invalid xattr key length: %d\n", key_len); res = -EIO; goto end_listxattr; } @@ -541,7 +540,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) if (hfsplus_uni2asc(inode->i_sb, (const struct hfsplus_unistr *)&fd.key->attr.key_name, strbuf, &xattr_name_len)) { - printk(KERN_ERR "hfs: unicode conversion failed\n"); + pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; } @@ -598,13 +597,13 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { - printk(KERN_ERR "hfs: catalog searching failed\n"); + pr_err("catalog searching failed\n"); goto end_removexattr; } @@ -643,7 +642,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { - printk(KERN_ERR "hfs: invalid catalog entry type\n"); + pr_err("invalid catalog entry type\n"); err = -EIO; goto end_removexattr; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0f6e52d22b84..cddb05217512 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -7,6 +7,7 @@ */ #include <linux/fs.h> +#include <linux/magic.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/pagemap.h> @@ -45,8 +46,6 @@ static const struct dentry_operations hostfs_dentry_ops = { static char *root_ino = ""; static int append = 0; -#define HOSTFS_SUPER_MAGIC 0x00c0ffee - static const struct inode_operations hostfs_iops; static const struct inode_operations hostfs_dir_iops; static const struct inode_operations hostfs_link_iops; @@ -121,7 +120,7 @@ static char *dentry_name(struct dentry *dentry) if (!name) return NULL; - return __dentry_name(dentry, name); /* will unlock */ + return __dentry_name(dentry, name); } static char *inode_name(struct inode *ino) @@ -229,10 +228,11 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kzalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL); if (hi == NULL) return NULL; hi->fd = -1; + hi->mode = 0; inode_init_once(&hi->vfs_inode); return &hi->vfs_inode; } @@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = { .show_options = hostfs_show_options, }; -int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) +int hostfs_readdir(struct file *file, struct dir_context *ctx) { void *dir; char *name; @@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) __putname(name); if (dir == NULL) return -error; - next = file->f_pos; + next = ctx->pos; while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { - error = (*filldir)(ent, name, len, file->f_pos, - ino, type); - if (error) break; - file->f_pos = next; + if (!dir_emit(ctx, name, len, ino, type)) + break; + ctx->pos = next; } close_dir(dir); return 0; @@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = { static const struct file_operations hostfs_dir_fops = { .llseek = generic_file_llseek, - .readdir = hostfs_readdir, + .iterate = hostfs_readdir, .read = generic_read_dir, }; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index f49d1498aa2e..4d0a1afa058c 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -7,8 +7,37 @@ */ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include "hpfs_fn.h" +void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) +{ + struct buffer_head *bh; + struct blk_plug plug; + + if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size)) + return; + + bh = sb_find_get_block(s, secno); + if (bh) { + if (buffer_uptodate(bh)) { + brelse(bh); + return; + } + brelse(bh); + }; + + blk_start_plug(&plug); + while (n > 0) { + if (unlikely(secno >= hpfs_sb(s)->sb_fs_size)) + break; + sb_breadahead(s, secno); + secno++; + n--; + } + blk_finish_plug(&plug); +} + /* Map a sector into a buffer and return pointers to it and to the buffer. */ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, @@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head hpfs_lock_assert(s); + hpfs_prefetch_sectors(s, secno, ahead); + cond_resched(); *bhp = bh = sb_bread(s, secno); @@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe return NULL; } + hpfs_prefetch_sectors(s, secno, 4 + ahead); + qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { printk("HPFS: hpfs_map_4sectors: out of memory\n"); diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 05d4816e4e77..fa27980f2229 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -12,8 +12,7 @@ * Note: the dentry argument is the parent dentry. */ -static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { unsigned long hash; int i; @@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino return 0; } -static int hpfs_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { unsigned al = len; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 546f6d39713a..292b1acb9b81 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,36 +33,38 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; + mutex_lock(&i->i_mutex); hpfs_lock(s); /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - mutex_unlock(&i->i_mutex); + hpfs_add_pos(i, &filp->f_pos); ok: + filp->f_pos = new_off; hpfs_unlock(s); - return filp->f_pos = new_off; -fail: mutex_unlock(&i->i_mutex); + return new_off; +fail: /*printk("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); + mutex_unlock(&i->i_mutex); return -ESPIPE; } -static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int hpfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); struct quad_buffer_head qbh; struct hpfs_dirent *de; int lc; - long old_pos; + loff_t next_pos; unsigned char *tempname; int c1, c2 = 0; int ret = 0; @@ -103,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } } lc = hpfs_sb(inode->i_sb)->sb_lowercase; - if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ - filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ + if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */ + ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */ goto out; } - if (filp->f_pos == 13) { + if (ctx->pos == 13) { ret = -ENOENT; goto out; } @@ -118,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) accepted by filldir, but what can I do? maybe killall -9 ls helps */ if (hpfs_sb(inode->i_sb)->sb_chk) - if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { + if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) { ret = -EFSERROR; goto out; } - if (filp->f_pos == 12) + if (ctx->pos == 12) goto out; - if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { - printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); + if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { + printk("HPFS: warning: pos==%d\n",(int)ctx->pos); goto out; } - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) goto out; - filp->f_pos = 11; + ctx->pos = 11; } - if (filp->f_pos == 11) { - if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) + if (ctx->pos == 11) { + if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR)) goto out; - filp->f_pos = 1; + ctx->pos = 1; } - if (filp->f_pos == 1) { - filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; - hpfs_add_pos(inode, &filp->f_pos); - filp->f_version = inode->i_version; + if (ctx->pos == 1) { + ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; + hpfs_add_pos(inode, &file->f_pos); + file->f_version = inode->i_version; } - old_pos = filp->f_pos; - if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { + next_pos = ctx->pos; + if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { + ctx->pos = next_pos; ret = -EIOERROR; goto out; } @@ -152,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (hpfs_sb(inode->i_sb)->sb_chk) { if (de->first && !de->last && (de->namelen != 2 || de ->name[0] != 1 || de->name[1] != 1)) - hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); + hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos); if (de->last && (de->namelen != 1 || de ->name[0] != 255)) - hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); + hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos); } hpfs_brelse4(&qbh); + ctx->pos = next_pos; goto again; } tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); - if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { - filp->f_pos = old_pos; + if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) { if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); goto out; } + ctx->pos = next_pos; if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); } @@ -320,7 +324,7 @@ const struct file_operations hpfs_dir_ops = { .llseek = hpfs_dir_lseek, .read = generic_read_dir, - .readdir = hpfs_readdir, + .iterate = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 9f9dbeceeee7..4e9dabcf1f4c 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -7,6 +7,7 @@ */ #include "hpfs_fn.h" +#include <linux/mpage.h> #define BLOCKS(size) (((size) + 511) >> 9) @@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) * so we must ignore such errors. */ -static secno hpfs_bmap(struct inode *inode, unsigned file_secno) +static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); unsigned n, disk_secno; @@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno) struct buffer_head *bh; if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; n = file_secno - hpfs_inode->i_file_sec; - if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; + if (n < hpfs_inode->i_n_secs) { + *n_secs = hpfs_inode->i_n_secs - n; + return hpfs_inode->i_disk_sec + n; + } if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); if (disk_secno == -1) return 0; if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; + n = file_secno - hpfs_inode->i_file_sec; + if (n < hpfs_inode->i_n_secs) { + *n_secs = hpfs_inode->i_n_secs - n; + return hpfs_inode->i_disk_sec + n; + } + *n_secs = 1; return disk_secno; } @@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he { int r; secno s; + unsigned n_secs; hpfs_lock(inode->i_sb); - s = hpfs_bmap(inode, iblock); + s = hpfs_bmap(inode, iblock, &n_secs); if (s) { + if (bh_result->b_size >> 9 < n_secs) + n_secs = bh_result->b_size >> 9; map_bh(bh_result, inode->i_sb, s); + bh_result->b_size = n_secs << 9; goto ret_0; } if (!create) goto ret_0; @@ -95,24 +109,40 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he return r; } +static int hpfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, hpfs_get_block); +} + static int hpfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,hpfs_get_block, wbc); + return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpage(struct file *file, struct page *page) +static int hpfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); +} + +static int hpfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_read_full_page(page,hpfs_get_block); + return mpage_writepages(mapping, wbc, hpfs_get_block); } static void hpfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + hpfs_lock(inode->i_sb); + if (to > inode->i_size) { truncate_pagecache(inode, to, inode->i_size); hpfs_truncate(inode); } + + hpfs_unlock(inode->i_sb); } static int hpfs_write_begin(struct file *file, struct address_space *mapping, @@ -131,6 +161,24 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int hpfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + hpfs_write_failed(mapping, pos + len); + if (!(err < 0)) { + /* make sure we write it on close, if not earlier */ + hpfs_lock(inode->i_sb); + hpfs_i(inode)->i_dirty = 1; + hpfs_unlock(inode->i_sb); + } + return err; +} + static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,hpfs_get_block); @@ -139,31 +187,19 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, + .readpages = hpfs_readpages, + .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, - .write_end = generic_write_end, + .write_end = hpfs_write_end, .bmap = _hpfs_bmap }; -static ssize_t hpfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - ssize_t retval; - - retval = do_sync_write(file, buf, count, ppos); - if (retval > 0) { - hpfs_lock(file->f_path.dentry->d_sb); - hpfs_i(file_inode(file))->i_dirty = 1; - hpfs_unlock(file->f_path.dentry->d_sb); - } - return retval; -} - const struct file_operations hpfs_file_ops = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, - .write = hpfs_file_write, + .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .release = hpfs_file_release, diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b7ae286646b5..1b398636e990 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -27,8 +27,9 @@ #define ALLOC_FWD_MAX 128 #define ALLOC_M 1 #define FNODE_RD_AHEAD 16 -#define ANODE_RD_AHEAD 16 -#define DNODE_RD_AHEAD 4 +#define ANODE_RD_AHEAD 0 +#define DNODE_RD_AHEAD 72 +#define COUNT_RD_AHEAD 62 #define FREE_DNODES_ADD 58 #define FREE_DNODES_DEL 29 @@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno); /* buffer.c */ +void hpfs_prefetch_sectors(struct super_block *, unsigned, int); void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); @@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *); __le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); __le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); +void hpfs_prefetch_bitmap(struct super_block *, unsigned); unsigned char *hpfs_load_code_page(struct super_block *, secno); __le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 4acb19d78359..3aa66ae1031e 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, struct quad_buffer_head *qbh, char *id) { secno sec; - if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { + __le32 *ret; + unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) { hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); return NULL; } @@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); return NULL; } - return hpfs_map_4sectors(s, sec, qbh, 4); + ret = hpfs_map_4sectors(s, sec, qbh, 4); + if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1); + return ret; +} + +void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block) +{ + unsigned to_prefetch, next_prefetch; + unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + if (unlikely(bmp_block >= n_bands)) + return; + to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); + if (unlikely(bmp_block + 1 >= n_bands)) + next_prefetch = 0; + else + next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]); + hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch)); } /* diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a0617e706957..4334cda8dba1 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) unsigned long *bits; unsigned count; - bits = hpfs_map_4sectors(s, secno, &qbh, 4); + bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) return 0; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); @@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s) unsigned n, count, n_bands; n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; - for (n = 0; n < n_bands; n++) + for (n = 0; n < COUNT_RD_AHEAD; n++) { + hpfs_prefetch_bitmap(s, n); + } + for (n = 0; n < n_bands; n++) { + hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + } return count; } @@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; sbi->sb_max_fwd_alloc = 0xffffff; - + + if (sbi->sb_fs_size >= 0x80000000) { + hpfs_error(s, "invalid size in superblock: %08x", + (unsigned)sbi->sb_fs_size); + goto bail4; + } + /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 126d3c2e2dee..4338ff32959d 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra) struct dentry *parent; char *root, *name; const char *seg_name; - int len, seg_len; + int len, seg_len, root_len; len = 0; parent = dentry; @@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra) } root = "proc"; - len += strlen(root); + root_len = strlen(root); + len += root_len; name = kmalloc(len + extra + 1, GFP_KERNEL); if (name == NULL) return NULL; @@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra) while (parent->d_parent != parent) { if (is_pid(parent)) { seg_name = "pid"; - seg_len = strlen("pid"); + seg_len = strlen(seg_name); } else { seg_name = parent->d_name.name; @@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra) len -= seg_len + 1; name[len] = '/'; - strncpy(&name[len + 1], seg_name, seg_len); + memcpy(&name[len + 1], seg_name, seg_len); parent = parent->d_parent; } - strncpy(name, root, strlen(root)); + memcpy(name, root, root_len); return name; } @@ -436,7 +437,6 @@ static int hppfs_open(struct inode *inode, struct file *file) path.mnt = inode->i_sb->s_fs_info; path.dentry = HPPFS_I(inode)->proc_dentry; - /* XXX This isn't closed anywhere */ data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); err = PTR_ERR(data->proc_file); if (IS_ERR(data->proc_file)) @@ -523,17 +523,28 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where) return default_llseek(file, off, where); } +static int hppfs_release(struct inode *inode, struct file *file) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + if (proc_file) + fput(proc_file); + kfree(data); + return 0; +} + static const struct file_operations hppfs_file_fops = { .owner = NULL, .llseek = hppfs_llseek, .read = hppfs_read, .write = hppfs_write, .open = hppfs_open, + .release = hppfs_release, }; struct hppfs_dirent { - void *vfs_dirent; - filldir_t filldir; + struct dir_context ctx; + struct dir_context *caller; struct dentry *dentry; }; @@ -545,43 +556,32 @@ static int hppfs_filldir(void *d, const char *name, int size, if (file_removed(dirent->dentry, name)) return 0; - return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, - inode, type); + dirent->caller->pos = dirent->ctx.pos; + return !dir_emit(dirent->caller, name, size, inode, type); } -static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) +static int hppfs_readdir(struct file *file, struct dir_context *ctx) { struct hppfs_private *data = file->private_data; struct file *proc_file = data->proc_file; - int (*readdir)(struct file *, void *, filldir_t); - struct hppfs_dirent dirent = ((struct hppfs_dirent) - { .vfs_dirent = ent, - .filldir = filldir, - .dentry = file->f_path.dentry - }); + struct hppfs_dirent d = { + .ctx.actor = hppfs_filldir, + .caller = ctx, + .dentry = file->f_path.dentry + }; int err; - - readdir = file_inode(proc_file)->i_fop->readdir; - - proc_file->f_pos = file->f_pos; - err = (*readdir)(proc_file, &dirent, hppfs_filldir); - file->f_pos = proc_file->f_pos; - + proc_file->f_pos = ctx->pos; + err = iterate_dir(proc_file, &d.ctx); + ctx->pos = d.ctx.pos; return err; } -static int hppfs_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - return filemap_write_and_wait_range(file->f_mapping, start, end); -} - static const struct file_operations hppfs_dir_fops = { .owner = NULL, - .readdir = hppfs_readdir, + .iterate = hppfs_readdir, .open = hppfs_dir_open, - .fsync = hppfs_fsync, .llseek = default_llseek, + .release = hppfs_release, }; static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 523464e62849..a3f868ae3fd4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void) static int get_hstate_idx(int page_size_log) { - struct hstate *h; + struct hstate *h = hstate_sizelog(page_size_log); - if (!page_size_log) - return default_hstate_idx; - h = size_to_hstate(1 << page_size_log); if (!h) return -1; return h - hstates; @@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = { .d_dname = hugetlb_dname }; -struct file *hugetlb_file_setup(const char *name, unsigned long addr, - size_t size, vm_flags_t acctflag, - struct user_struct **user, +/* + * Note that size should be aligned to proper hugepage size in caller side, + * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. + */ +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, int creat_flags, int page_size_log) { struct file *file = ERR_PTR(-ENOMEM); @@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, struct path path; struct super_block *sb; struct qstr quick_string; - struct hstate *hstate; - unsigned long num_pages; int hstate_idx; hstate_idx = get_hstate_idx(page_size_log); @@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, if (!inode) goto out_dentry; - hstate = hstate_inode(inode); - size += addr & ~huge_page_mask(hstate); - num_pages = ALIGN(size, huge_page_size(hstate)) >> - huge_page_shift(hstate); file = ERR_PTR(-ENOMEM); - if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(path.dentry, inode); diff --git a/fs/inode.c b/fs/inode.c index a898b3d43ccf..d6dfb09c8280 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink); */ void inc_nlink(struct inode *inode) { - if (WARN_ON(inode->i_nlink == 0)) + if (unlikely(inode->i_nlink == 0)) { + WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); + } inode->__i_nlink++; } @@ -1803,7 +1805,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) - inode->i_fop = &def_fifo_fops; + inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; else diff --git a/fs/internal.h b/fs/internal.h index 4be78237d896..7c5f01cf619d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -96,11 +96,12 @@ struct open_flags { umode_t mode; int acc_mode; int intent; + int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, - const struct open_flags *op, int flags); + const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, - const char *, const struct open_flags *, int lookup_flags); + const char *, const struct open_flags *); extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); @@ -130,3 +131,15 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); * read_write.c */ extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); +extern int rw_verify_area(int, struct file *, const loff_t *, size_t); + +/* + * splice.c + */ +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + +/* + * pipe.c + */ +extern const struct file_operations pipefifo_fops; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index a7d5c3c3d4e6..b943cbd963bb 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de, /* * This should _really_ be cleaned up some day.. */ -static int do_isofs_readdir(struct inode *inode, struct file *filp, - void *dirent, filldir_t filldir, +static int do_isofs_readdir(struct inode *inode, struct file *file, + struct dir_context *ctx, char *tmpname, struct iso_directory_record *tmpde) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); @@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, struct iso_directory_record *de; struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); - offset = filp->f_pos & (bufsize - 1); - block = filp->f_pos >> bufbits; + offset = ctx->pos & (bufsize - 1); + block = ctx->pos >> bufbits; - while (filp->f_pos < inode->i_size) { + while (ctx->pos < inode->i_size) { int de_len; if (!bh) { @@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, de = (struct iso_directory_record *) (bh->b_data + offset); - de_len = *(unsigned char *) de; + de_len = *(unsigned char *)de; /* * If the length byte is zero, we should move on to the next @@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if (de_len == 0) { brelse(bh); bh = NULL; - filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); - block = filp->f_pos >> bufbits; + ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = ctx->pos >> bufbits; offset = 0; continue; } @@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if (de->flags[-sbi->s_high_sierra] & 0x80) { first_de = 0; - filp->f_pos += de_len; + ctx->pos += de_len; continue; } first_de = 1; /* Handle the case of the '.' directory */ if (de->name_len[0] == 1 && de->name[0] == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + if (!dir_emit_dot(file, ctx)) break; - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, /* Handle the case of the '..' directory */ if (de->name_len[0] == 1 && de->name[0] == 1) { - inode_number = parent_ino(filp->f_path.dentry); - if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) + if (!dir_emit_dotdot(file, ctx)) break; - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || (!sbi->s_showassoc && (de->flags[-sbi->s_high_sierra] & 4))) { - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, } } if (len > 0) { - if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) + if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN)) break; } - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, * handling split directory entries.. The real work is done by * "do_isofs_readdir()". */ -static int isofs_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int isofs_readdir(struct file *file, struct dir_context *ctx) { int result; char *tmpname; struct iso_directory_record *tmpde; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); tmpname = (char *)__get_free_page(GFP_KERNEL); if (tmpname == NULL) @@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp, tmpde = (struct iso_directory_record *) (tmpname+1024); - result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); + result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde); free_page((unsigned long) tmpname); return result; @@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = isofs_readdir, + .iterate = isofs_readdir, }; /* diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d9b8aebdeb22..c348d6d88624 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -28,31 +28,23 @@ #define BEQUIET -static int isofs_hashi(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); -static int isofs_hash(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); +static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET -static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); -static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); +static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp_ms(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif @@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common( } static int -isofs_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hash(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 0); } static int -isofs_hashi(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } static int -isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 0); } static int -isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 1); @@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, #ifdef CONFIG_JOLIET static int -isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int -isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } static int -isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int -isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 1); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index c167028844ed..95295640d9c8 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen) qstr.name = compare; qstr.len = dlen; - return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, - dentry->d_name.len, dentry->d_name.name, &qstr); + return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr); } /* diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 86b39b167c23..11bb11f48b3a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -162,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(write_op, wbuf[i]); + /* + * Here we write back pagecache data that may be mmaped. Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); } } @@ -667,7 +676,17 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(write_op, bh); + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. + */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 81cc7eaff863..6510d6355729 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -310,8 +310,6 @@ int journal_write_metadata_buffer(transaction_t *transaction, new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); new_jh = journal_add_journal_head(new_bh); /* This sleeps */ @@ -564,6 +562,16 @@ int log_wait_commit(journal_t *journal, tid_t tid) spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + /* + * Not running or committing trans? Must be already committed. This + * saves us from waiting for a *long* time when tid overflows. + */ + if (!((journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) || + (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid))) + goto out_unlock; + if (!tid_geq(journal->j_commit_waited, tid)) journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { @@ -575,6 +583,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) !tid_gt(tid, journal->j_commit_sequence)); spin_lock(&journal->j_state_lock); } +out_unlock: spin_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { @@ -1845,7 +1854,7 @@ static struct journal_head *journal_alloc_journal_head(void) #ifdef CONFIG_JBD_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); if (ret == NULL) { jbd_debug(1, "out of memory for journal_head\n"); printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", @@ -1853,7 +1862,7 @@ static struct journal_head *journal_alloc_journal_head(void) while (ret == NULL) { yield(); - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); } } return ret; @@ -1915,10 +1924,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 071d6905f0dd..be0c39b66fe0 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -245,7 +245,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; @@ -2020,16 +2019,20 @@ zap_buffer_unlocked: * void journal_invalidatepage() - invalidate a journal page * @journal: journal to use for flush * @page: page to flush - * @offset: length of page to invalidate. + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. + * Reap page buffers containing data in specified range in page. */ void journal_invalidatepage(journal_t *journal, struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; if (!PageLocked(page)) @@ -2037,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ @@ -2046,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return; + if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); may_free &= journal_unmap_buffer(journal, bh, - offset > 0); + partial_page); unlock_buffer(bh); } curr_off = next_off; @@ -2058,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { + if (!partial_page) { if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 69a48c2944da..5a9f5534d57b 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -20,7 +20,7 @@ config JBD2 config JBD2_DEBUG bool "JBD2 (ext4) debugging support" - depends on JBD2 && DEBUG_FS + depends on JBD2 help If you are using the ext4 journaled file system (or potentially any other filesystem/device using JBD2), this option @@ -29,7 +29,7 @@ config JBD2_DEBUG By default, the debugging output will be turned off. If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a number between 1 and 5. The higher the number, the more debugging output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". + "echo 0 > /sys/module/jbd2/parameters/jbd2_debug". diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c78841ee81cf..7f34f4716165 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ - nblocks = jbd_space_needed(journal); - while (__jbd2_log_space_left(journal) < nblocks) { + nblocks = jbd2_space_needed(journal); + while (jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; write_unlock(&journal->j_state_lock); @@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) */ write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); - nblocks = jbd_space_needed(journal); - space_left = __jbd2_log_space_left(journal); + nblocks = jbd2_space_needed(journal); + space_left = jbd2_log_space_left(journal); if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; tid_t tid = 0; @@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal) /* We were able to recover space; yay! */ ; } else if (tid) { + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set. So we need to temporarily drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); + write_lock(&journal->j_state_lock); + continue; } else { printk(KERN_ERR "%s: needed %d blocks and " "only had %d space available\n", @@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - - /* Just in case anybody was waiting for more transactions to be - checkpointed... */ - wake_up(&journal->j_wait_logspace); ret = 1; out: return ret; @@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); J_ASSERT(transaction->t_forget == NULL); - J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); - J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(atomic_read(&transaction->t_updates) == 0); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..559bec1a37b4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -30,15 +30,22 @@ #include <trace/events/jbd2.h> /* - * Default IO end handler for temporary BJ_IO buffer_heads. + * IO end handler for temporary buffer_heads handling writes to the journal. */ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { + struct buffer_head *orig_bh = bh->b_private; + BUFFER_TRACE(bh, ""); if (uptodate) set_buffer_uptodate(bh); else clear_buffer_uptodate(bh); + if (orig_bh) { + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); + smp_mb__after_clear_bit(); + wake_up_bit(&orig_bh->b_state, BH_Shadow); + } unlock_buffer(bh); } @@ -85,8 +92,7 @@ nope: __brelse(bh); } -static void jbd2_commit_block_csum_set(journal_t *j, - struct journal_head *descriptor) +static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) { struct commit_header *h; __u32 csum; @@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j, if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - h = (struct commit_header *)(jh2bh(descriptor)->b_data); + h = (struct commit_header *)(bh->b_data); h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, - j->j_blocksize); + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head **cbh, __u32 crc32_sum) { - struct journal_head *descriptor; struct commit_header *tmp; struct buffer_head *bh; int ret; @@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal, if (is_journal_aborted(journal)) return 0; - descriptor = jbd2_journal_get_descriptor_buffer(journal); - if (!descriptor) + bh = jbd2_journal_get_descriptor_buffer(journal); + if (!bh) return 1; - bh = jh2bh(descriptor); - tmp = (struct commit_header *)bh->b_data; tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); @@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } - jbd2_commit_block_csum_set(journal, descriptor); + jbd2_commit_block_csum_set(journal, bh); - JBUFFER_TRACE(descriptor, "submit commit block"); + BUFFER_TRACE(bh, "submit commit block"); lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal, if (unlikely(!buffer_uptodate(bh))) ret = -EIO; put_bh(bh); /* One for getblk() */ - jbd2_journal_put_journal_head(bh2jh(bh)); return ret; } @@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, } static void jbd2_descr_block_csum_set(journal_t *j, - struct journal_head *descriptor) + struct buffer_head *bh) { struct jbd2_journal_block_tail *tail; __u32 csum; @@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j, if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - tail = (struct jbd2_journal_block_tail *) - (jh2bh(descriptor)->b_data + j->j_blocksize - + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, - j->j_blocksize); + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, { struct page *page = bh->b_page; __u8 *addr; - __u32 csum; + __u32 csum32; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; sequence = cpu_to_be32(sequence); addr = kmap_atomic(page); - csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); - csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), - bh->b_size); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), + bh->b_size); kunmap_atomic(addr); - tag->t_checksum = cpu_to_be32(csum); + /* We only have space to store the lower 16 bits of the crc32c. */ + tag->t_checksum = cpu_to_be16(csum32); } /* * jbd2_journal_commit_transaction @@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) { struct transaction_stats_s stats; transaction_t *commit_transaction; - struct journal_head *jh, *new_jh, *descriptor; + struct journal_head *jh; + struct buffer_head *descriptor; struct buffer_head **wbuf = journal->j_wbuf; int bufs; int flags; @@ -382,7 +383,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i, to_free = 0; + int i; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) tid_t first_tid; int update_tail; int csum_size = 0; + LIST_HEAD(io_bufs); + LIST_HEAD(log_bufs); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) csum_size = sizeof(struct jbd2_journal_block_tail); @@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd2_start_commit(journal, commit_transaction); jbd_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; trace_jbd2_commit_locking(journal, commit_transaction); @@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + /* + * Reserved credits cannot be claimed anymore, free them + */ + atomic_sub(atomic_read(&journal->j_reserved_credits), + &commit_transaction->t_outstanding_credits); + trace_jbd2_commit_flushing(journal, commit_transaction); stats.run.rs_flushing = jiffies; stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, @@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug(3, "JBD2: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2a\n"); /* * Now start flushing things to disk, in the order they appear @@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, - WRITE_SYNC); + &log_bufs, WRITE_SYNC); blk_finish_plug(&plug); - jbd_debug(3, "JBD2: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2b\n"); /* * Way to go: we have now written out all of the data for a @@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; - descriptor = NULL; bufs = 0; + descriptor = NULL; blk_start_plug(&plug); while (commit_transaction->t_buffers) { @@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) record the metadata buffer. */ if (!descriptor) { - struct buffer_head *bh; - J_ASSERT (bufs == 0); jbd_debug(4, "JBD2: get descriptor\n"); @@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal) continue; } - bh = jh2bh(descriptor); jbd_debug(4, "JBD2: got buffer %llu (%p)\n", - (unsigned long long)bh->b_blocknr, bh->b_data); - header = (journal_header_t *)&bh->b_data[0]; + (unsigned long long)descriptor->b_blocknr, + descriptor->b_data); + header = (journal_header_t *)descriptor->b_data; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - tagp = &bh->b_data[sizeof(journal_header_t)]; - space_left = bh->b_size - sizeof(journal_header_t); + tagp = &descriptor->b_data[sizeof(journal_header_t)]; + space_left = descriptor->b_size - + sizeof(journal_header_t); first_tag = 1; - set_buffer_jwrite(bh); - set_buffer_dirty(bh); - wbuf[bufs++] = bh; + set_buffer_jwrite(descriptor); + set_buffer_dirty(descriptor); + wbuf[bufs++] = descriptor; /* Record it so that we can wait for IO completion later */ - BUFFER_TRACE(bh, "ph3: file as descriptor"); - jbd2_journal_file_buffer(descriptor, commit_transaction, - BJ_LogCtl); + BUFFER_TRACE(descriptor, "ph3: file as descriptor"); + jbd2_file_log_bh(&log_bufs, descriptor); } /* Where is the buffer to be written? */ @@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get - rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + rid of the shadow pairing of buffers. */ atomic_inc(&jh2bh(jh)->b_count); - /* Make a temporary IO buffer with which to write it out - (this will requeue both the metadata buffer and the - temporary IO buffer). new_bh goes on BJ_IO*/ - - set_bit(BH_JWrite, &jh2bh(jh)->b_state); /* - * akpm: jbd2_journal_write_metadata_buffer() sets - * new_bh->b_transaction to commit_transaction. - * We need to clean this up before we release new_bh - * (which is of type BJ_IO) + * Make a temporary IO buffer with which to write it out + * (this will requeue the metadata buffer to BJ_Shadow). */ + set_bit(BH_JWrite, &jh2bh(jh)->b_state); JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, - jh, &new_jh, blocknr); + jh, &wbuf[bufs], blocknr); if (flags < 0) { jbd2_journal_abort(journal, flags); continue; } - set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); - wbuf[bufs++] = jh2bh(new_jh); + jbd2_file_log_bh(&io_bufs, wbuf[bufs]); /* Record the new block's tag in the current descriptor buffer */ @@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag = (journal_block_tag_t *) tagp; write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be16(tag_flag); - jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), + jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], commit_transaction->t_tid); tagp += tag_bytes; space_left -= tag_bytes; + bufs++; if (first_tag) { memcpy (tagp, journal->j_uuid, 16); @@ -809,7 +810,7 @@ start_journal_io: the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. + the io_bufs list. Wait for the buffers in reverse order. That way we are less likely to be woken up until all IOs have completed, and @@ -818,47 +819,33 @@ start_journal_io: jbd_debug(3, "JBD2: commit phase 3\n"); - /* - * akpm: these are BJ_IO, and j_list_lock is not needed. - * See __journal_try_to_free_buffer. - */ -wait_for_iobuf: - while (commit_transaction->t_iobuf_list != NULL) { - struct buffer_head *bh; + while (!list_empty(&io_bufs)) { + struct buffer_head *bh = list_entry(io_bufs.prev, + struct buffer_head, + b_assoc_buffers); - jh = commit_transaction->t_iobuf_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_iobuf; - } - if (cond_resched()) - goto wait_for_iobuf; + wait_on_buffer(bh); + cond_resched(); if (unlikely(!buffer_uptodate(bh))) err = -EIO; - - clear_buffer_jwrite(bh); - - JBUFFER_TRACE(jh, "ph4: unfile after journal write"); - jbd2_journal_unfile_buffer(journal, jh); + jbd2_unfile_log_bh(bh); /* - * ->t_iobuf_list should contain only dummy buffer_heads - * which were created by jbd2_journal_write_metadata_buffer(). + * The list contains temporary buffer heads created by + * jbd2_journal_write_metadata_buffer(). */ BUFFER_TRACE(bh, "dumping temporary bh"); - jbd2_journal_put_journal_head(jh); __brelse(bh); J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); free_buffer_head(bh); - /* We also have to unlock and free the corresponding - shadowed buffer */ + /* We also have to refile the corresponding shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); - clear_bit(BH_JWrite, &bh->b_state); + clear_buffer_jwrite(bh); J_ASSERT_BH(bh, buffer_jbddirty(bh)); + J_ASSERT_BH(bh, !buffer_shadow(bh)); /* The metadata is now released for reuse, but we need to remember it against this transaction so that when @@ -866,14 +853,6 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* - * Wake up any transactions which were waiting for this IO to - * complete. The barrier must be here so that changes by - * jbd2_journal_file_buffer() take effect before wake_up_bit() - * does the waitqueue check. - */ - smp_mb(); - wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); } @@ -883,26 +862,19 @@ wait_for_iobuf: jbd_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ - wait_for_ctlbuf: - while (commit_transaction->t_log_list != NULL) { + while (!list_empty(&log_bufs)) { struct buffer_head *bh; - jh = commit_transaction->t_log_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_ctlbuf; - } - if (cond_resched()) - goto wait_for_ctlbuf; + bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); + wait_on_buffer(bh); + cond_resched(); if (unlikely(!buffer_uptodate(bh))) err = -EIO; BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); - jbd2_journal_unfile_buffer(journal, jh); - jbd2_journal_put_journal_head(jh); + jbd2_unfile_log_bh(bh); __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } @@ -952,9 +924,7 @@ wait_for_iobuf: J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); - J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); - J_ASSERT(commit_transaction->t_log_list == NULL); restart_loop: /* @@ -1134,7 +1104,7 @@ restart_loop: journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); - commit_transaction->t_state = T_FINISHED; + commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; @@ -1149,38 +1119,44 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __jbd2_journal_drop_transaction(journal, commit_transaction); - to_free = 1; + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = commit_transaction; - } } spin_unlock(&journal->j_list_lock); - + /* Drop all spin_locks because commit_callback may be block. + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); - if (to_free) - jbd2_journal_free_transaction(commit_transaction); + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Recheck checkpoint lists after j_list_lock was dropped */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ed10991ab006..02c7ad9d7a41 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); +#ifdef CONFIG_JBD2_DEBUG +void __jbd2_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > jbd2_journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd2_debug); +#endif + /* Checksumming functions */ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { @@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal) * * If the source buffer has already been modified by a new transaction * since we took the last commit snapshot, we use the frozen copy of - * that data for IO. If we end up using the existing buffer_head's data - * for the write, then we *have* to lock the buffer to prevent anyone - * else from using and possibly modifying it while the IO is in - * progress. - * - * The function returns a pointer to the buffer_heads to be used for IO. + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we have to make sure nobody modifies it while the + * IO is in progress. do_get_write_access() handles this. * - * We assume that the journal has already been locked in this function. + * The function returns a pointer to the buffer_head to be used for IO. + * * * Return value: * <0: Error @@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal) int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, - struct journal_head **jh_out, - unsigned long long blocknr) + struct buffer_head **bh_out, + sector_t blocknr) { int need_copy_out = 0; int done_copy_out = 0; int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; - struct journal_head *new_jh; struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); @@ -367,17 +382,14 @@ retry_alloc: } /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); - new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ + jbd_lock_bh_state(bh_in); +repeat: /* * If a new transaction has already done a buffer copy-out, then * we use that version of the data for the commit. */ - jbd_lock_bh_state(bh_in); -repeat: if (jh_in->b_frozen_data) { done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); @@ -417,7 +429,7 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { - jbd2_journal_put_journal_head(new_jh); + brelse(new_bh); return -ENOMEM; } jbd_lock_bh_state(bh_in); @@ -428,7 +440,7 @@ repeat: jh_in->b_frozen_data = tmp; mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + memcpy(tmp, mapped_data + new_offset, bh_in->b_size); kunmap_atomic(mapped_data); new_page = virt_to_page(tmp); @@ -454,14 +466,14 @@ repeat: } set_bh_page(new_bh, new_page, new_offset); - new_jh->b_transaction = NULL; - new_bh->b_size = jh2bh(jh_in)->b_size; - new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_size = bh_in->b_size; + new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; + new_bh->b_private = bh_in; set_buffer_mapped(new_bh); set_buffer_dirty(new_bh); - *jh_out = new_jh; + *bh_out = new_bh; /* * The to-be-written buffer needs to get moved to the io queue, @@ -472,11 +484,9 @@ repeat: spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); + set_buffer_shadow(bh_in); jbd_unlock_bh_state(bh_in); - JBUFFER_TRACE(new_jh, "file as BJ_IO"); - jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); - return do_escape | (done_copy_out << 1); } @@ -486,35 +496,6 @@ repeat: */ /* - * __jbd2_log_space_left: Return the number of free blocks left in the journal. - * - * Called with the journal already locked. - * - * Called under j_state_lock - */ - -int __jbd2_log_space_left(journal_t *journal) -{ - int left = journal->j_free; - - /* assert_spin_locked(&journal->j_state_lock); */ - - /* - * Be pessimistic here about the number of those free blocks which - * might be required for log descriptor control blocks. - */ - -#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ - - left -= MIN_LOG_RESERVED_BLOCKS; - - if (left <= 0) - return 0; - left -= (left >> 3); - return left; -} - -/* * Called with j_state_lock locked for writing. * Returns true if a transaction commit was started. */ @@ -566,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) } /* - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. - * - * We can only force the running transaction if we don't have an active handle; - * otherwise, we will deadlock. - * - * Returns true if a transaction was started. + * Force and wait any uncommitted transactions. We can only force the running + * transaction if we don't have an active handle, otherwise, we will deadlock. + * Returns: <0 in case of error, + * 0 if nothing to commit, + * 1 if transaction was successfully committed. */ -int jbd2_journal_force_commit_nested(journal_t *journal) +static int __jbd2_journal_force_commit(journal_t *journal) { transaction_t *transaction = NULL; tid_t tid; - int need_to_start = 0; + int need_to_start = 0, ret = 0; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { @@ -590,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { + /* Nothing to commit */ read_unlock(&journal->j_state_lock); - return 0; /* Nothing to retry */ + return 0; } - tid = transaction->t_tid; read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); - jbd2_log_wait_commit(journal, tid); - return 1; + ret = jbd2_log_wait_commit(journal, tid); + if (!ret) + ret = 1; + + return ret; +} + +/** + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * @journal: journal to force + * Returns true if progress was made. + */ +int jbd2_journal_force_commit_nested(journal_t *journal) +{ + int ret; + + ret = __jbd2_journal_force_commit(journal); + return ret > 0; +} + +/** + * int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * Caller want unconditional commit. We can only force the running transaction + * if we don't have an active handle, otherwise, we will deadlock. + */ +int jbd2_journal_force_commit(journal_t *journal) +{ + int ret; + + J_ASSERT(!current->journal_info); + ret = __jbd2_journal_force_commit(journal); + if (ret > 0) + ret = 0; + return ret; } /* @@ -710,6 +725,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } /* + * When this function returns the transaction corresponding to tid + * will be completed. If the transaction has currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS. + */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + +/* * Log buffer allocation routines: */ @@ -769,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ -struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; unsigned long long blocknr; @@ -788,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); - return jbd2_journal_add_journal_head(bh); + return bh; } /* @@ -950,7 +996,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE(inode)->data; + journal_t *journal = PDE_DATA(inode); struct jbd2_stats_proc_session *s; int rc, size; @@ -1033,11 +1079,10 @@ static journal_t * journal_init_common (void) return NULL; init_waitqueue_head(&journal->j_wait_transaction_locked); - init_waitqueue_head(&journal->j_wait_logspace); init_waitqueue_head(&journal->j_wait_done_commit); - init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); + init_waitqueue_head(&journal->j_wait_reserved); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); @@ -1047,6 +1092,7 @@ static journal_t * journal_init_common (void) journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ + atomic_set(&journal->j_reserved_credits, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1289,6 +1335,7 @@ static int journal_reset(journal_t *journal) static void jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; + journal_superblock_t *sb = journal->j_superblock; int ret; trace_jbd2_write_superblock(journal, write_op); @@ -1310,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } + jbd2_superblock_csum_set(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(write_op, bh); @@ -1406,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", journal->j_errno); sb->s_errno = cpu_to_be32(journal->j_errno); - jbd2_superblock_csum_set(journal, sb); read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, WRITE_SYNC); @@ -2296,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void) #ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); while (!ret) { yield(); - ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); } } return ret; @@ -2364,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 626846bac32f..d4851464b57e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, void *buf, __u32 sequence) { - __u32 provided, calculated; + __u32 csum32; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; sequence = cpu_to_be32(sequence); - calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); - calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); - provided = be32_to_cpu(tag->t_checksum); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); - return provided == cpu_to_be32(calculated); + return tag->t_checksum == cpu_to_be16(csum32); } static int do_one_pass(journal_t *journal, diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f30b80b4ce8b..198c9c10276d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -122,9 +122,10 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, - struct journal_head **, int *, + struct list_head *, + struct buffer_head **, int *, struct jbd2_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct journal_head *, int, int); +static void flush_descriptor(journal_t *, struct buffer_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) */ void jbd2_journal_write_revoke_records(journal_t *journal, transaction_t *transaction, + struct list_head *log_bufs, int write_op) { - struct journal_head *descriptor; + struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; @@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal, while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; - write_one_revoke_record(journal, transaction, + write_one_revoke_record(journal, transaction, log_bufs, &descriptor, &offset, record, write_op); count++; @@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, - struct journal_head **descriptorp, + struct list_head *log_bufs, + struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record, int write_op) { int csum_size = 0; - struct journal_head *descriptor; + struct buffer_head *descriptor; int offset; journal_header_t *header; @@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal, descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) return; - header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header = (journal_header_t *)descriptor->b_data; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); header->h_sequence = cpu_to_be32(transaction->t_tid); /* Record it so that we can wait for IO completion later */ - JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); - jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); + BUFFER_TRACE(descriptor, "file in log_bufs"); + jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { - * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = + * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); offset += 8; } else { - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += 4; } @@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal, *offsetp = offset; } -static void jbd2_revoke_csum_set(journal_t *j, - struct journal_head *descriptor) +static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) { struct jbd2_journal_revoke_tail *tail; __u32 csum; @@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j, if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - tail = (struct jbd2_journal_revoke_tail *) - (jh2bh(descriptor)->b_data + j->j_blocksize - + tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_revoke_tail)); tail->r_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, - j->j_blocksize); + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); tail->r_checksum = cpu_to_be32(csum); } @@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j, */ static void flush_descriptor(journal_t *journal, - struct journal_head *descriptor, + struct buffer_head *descriptor, int offset, int write_op) { jbd2_journal_revoke_header_t *header; - struct buffer_head *bh = jh2bh(descriptor); if (is_journal_aborted(journal)) { - put_bh(bh); + put_bh(descriptor); return; } - header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_revoke_csum_set(journal, descriptor); - set_buffer_jwrite(bh); - BUFFER_TRACE(bh, "write"); - set_buffer_dirty(bh); - write_dirty_buffer(bh, write_op); + set_buffer_jwrite(descriptor); + BUFFER_TRACE(descriptor, "write"); + set_buffer_dirty(descriptor); + write_dirty_buffer(descriptor, write_op); } #endif diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 325bc019ed88..7aa9a32573bb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); - atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_outstanding_credits, + atomic_read(&journal->j_reserved_credits)); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction, } /* + * Wait until running transaction passes T_LOCKED state. Also starts the commit + * if needed. The function expects running transaction to exist and releases + * j_state_lock. + */ +static void wait_transaction_locked(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + int need_to_start; + tid_t tid = journal->j_running_transaction->t_tid; + + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); +} + +static void sub_reserved_credits(journal_t *journal, int blocks) +{ + atomic_sub(blocks, &journal->j_reserved_credits); + wake_up(&journal->j_wait_reserved); +} + +/* + * Wait until we can add credits for handle to the running transaction. Called + * with j_state_lock held for reading. Returns 0 if handle joined the running + * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and + * caller must retry. + */ +static int add_transaction_credits(journal_t *journal, int blocks, + int rsv_blocks) +{ + transaction_t *t = journal->j_running_transaction; + int needed; + int total = blocks + rsv_blocks; + + /* + * If the current transaction is locked down for commit, wait + * for the lock to be released. + */ + if (t->t_state == T_LOCKED) { + wait_transaction_locked(journal); + return 1; + } + + /* + * If there is not enough space left in the log to write all + * potential buffers requested by this operation, we need to + * stall pending a log checkpoint to free some more log space. + */ + needed = atomic_add_return(total, &t->t_outstanding_credits); + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, + * then start to commit it: we can then go back and + * attach this handle to a new transaction. + */ + atomic_sub(total, &t->t_outstanding_credits); + wait_transaction_locked(journal); + return 1; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + */ + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + return 1; + } + + /* No reservation? We are done... */ + if (!rsv_blocks) + return 0; + + needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); + /* We allow at most half of a transaction to be reserved */ + if (needed > journal->j_max_transaction_buffers / 2) { + sub_reserved_credits(journal, rsv_blocks); + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + rsv_blocks + <= journal->j_max_transaction_buffers / 2); + return 1; + } + return 0; +} + +/* * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the @@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; - tid_t tid; - int needed, need_to_start; - int nblocks = handle->h_buffer_credits; + int blocks = handle->h_buffer_credits; + int rsv_blocks = 0; unsigned long ts = jiffies; - if (nblocks > journal->j_max_transaction_buffers) { + /* + * 1/2 of transaction can be reserved so we can practically handle + * only 1/2 of maximum transaction size per operation + */ + if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", - current->comm, nblocks, - journal->j_max_transaction_buffers); + current->comm, blocks, + journal->j_max_transaction_buffers / 2); return -ENOSPC; } + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + alloc_transaction: if (!journal->j_running_transaction) { new_transaction = kmem_cache_zalloc(transaction_cache, @@ -199,8 +312,12 @@ repeat: return -EROFS; } - /* Wait on the journal's transaction barrier if necessary */ - if (journal->j_barrier_count) { + /* + * Wait on the journal's transaction barrier if necessary. Specifically + * we allow reserved handles to proceed because otherwise commit could + * deadlock on page writeback not being able to complete. + */ + if (!handle->h_reserved && journal->j_barrier_count) { read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); @@ -213,7 +330,7 @@ repeat: goto alloc_transaction; write_lock(&journal->j_state_lock); if (!journal->j_running_transaction && - !journal->j_barrier_count) { + (handle->h_reserved || !journal->j_barrier_count)) { jbd2_get_transaction(journal, new_transaction); new_transaction = NULL; } @@ -223,85 +340,18 @@ repeat: transaction = journal->j_running_transaction; - /* - * If the current transaction is locked down for commit, wait for the - * lock to be released. - */ - if (transaction->t_state == T_LOCKED) { - DEFINE_WAIT(wait); - - prepare_to_wait(&journal->j_wait_transaction_locked, - &wait, TASK_UNINTERRUPTIBLE); - read_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * If there is not enough space left in the log to write all potential - * buffers requested by this operation, we need to stall pending a log - * checkpoint to free some more log space. - */ - needed = atomic_add_return(nblocks, - &transaction->t_outstanding_credits); - - if (needed > journal->j_max_transaction_buffers) { + if (!handle->h_reserved) { + /* We may have dropped j_state_lock - restart in that case */ + if (add_transaction_credits(journal, blocks, rsv_blocks)) + goto repeat; + } else { /* - * If the current transaction is already too large, then start - * to commit it: we can then go back and attach this handle to - * a new transaction. + * We have handle reserved so we are allowed to join T_LOCKED + * transaction and we don't have to check for transaction size + * and journal space. */ - DEFINE_WAIT(wait); - - jbd_debug(2, "Handle %p starting new commit...\n", handle); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, - TASK_UNINTERRUPTIBLE); - tid = transaction->t_tid; - need_to_start = !tid_geq(journal->j_commit_request, tid); - read_unlock(&journal->j_state_lock); - if (need_to_start) - jbd2_log_start_commit(journal, tid); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * The commit code assumes that it can get enough log space - * without forcing a checkpoint. This is *critical* for - * correctness: a checkpoint of a buffer which is also - * associated with a committing transaction creates a deadlock, - * so commit simply cannot force through checkpoints. - * - * We must therefore ensure the necessary space in the journal - * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. - * - * The worst part is, any transaction currently committing can - * reduce the free space arbitrarily. Be careful to account for - * those buffers when checkpointing. - */ - - /* - * @@@ AKPM: This seems rather over-defensive. We're giving commit - * a _lot_ of headroom: 1/4 of the journal plus the size of - * the committing transaction. Really, we only need to give it - * committing_transaction->t_outstanding_credits plus "enough" for - * the log control blocks. - * Also, this test is inconsistent with the matching one in - * jbd2_journal_extend(). - */ - if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { - jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - read_unlock(&journal->j_state_lock); - write_lock(&journal->j_state_lock); - if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) - __jbd2_log_wait_for_space(journal); - write_unlock(&journal->j_state_lock); - goto repeat; + sub_reserved_credits(journal, blocks); + handle->h_reserved = 0; } /* OK, account for the buffers that this operation expects to @@ -309,15 +359,16 @@ repeat: */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; - handle->h_requested_credits = nblocks; + handle->h_requested_credits = blocks; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); - jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, + jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", + handle, blocks, atomic_read(&transaction->t_outstanding_credits), - __jbd2_log_space_left(journal)); + jbd2_log_space_left(journal)); read_unlock(&journal->j_state_lock); + current->journal_info = handle; lock_map_acquire(&handle->h_lockdep_map); jbd2_journal_free_transaction(new_transaction); @@ -332,7 +383,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; @@ -349,16 +399,21 @@ static handle_t *new_handle(int nblocks) * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee - * that much space. - * - * This function is visible to journal users (like ext3fs), so is not - * called with the journal already locked. + * that much space. Additionally, if rsv_blocks > 0, we also create another + * handle with rsv_blocks reserved blocks in the journal. This handle is + * is stored in h_rsv_handle. It is not attached to any particular transaction + * and thus doesn't block transaction commit. If the caller uses this reserved + * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() + * on the parent handle will dispose the reserved one. Reserved handle has to + * be converted to a normal handle using jbd2_journal_start_reserved() before + * it can be used. * * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, - unsigned int type, unsigned int line_no) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, + gfp_t gfp_mask, unsigned int type, + unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -375,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); + if (rsv_blocks) { + handle_t *rsv_handle; - current->journal_info = handle; + rsv_handle = new_handle(rsv_blocks); + if (!rsv_handle) { + jbd2_free_handle(handle); + return ERR_PTR(-ENOMEM); + } + rsv_handle->h_reserved = 1; + rsv_handle->h_journal = journal; + handle->h_rsv_handle = rsv_handle; + } err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); - current->journal_info = NULL; return ERR_PTR(err); } handle->h_type = type; @@ -396,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start); handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); + return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); +void jbd2_journal_free_reserved(handle_t *handle) +{ + journal_t *journal = handle->h_journal; + + WARN_ON(!handle->h_reserved); + sub_reserved_credits(journal, handle->h_buffer_credits); + jbd2_free_handle(handle); +} +EXPORT_SYMBOL(jbd2_journal_free_reserved); + +/** + * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle + * @handle: handle to start + * + * Start handle that has been previously reserved with jbd2_journal_reserve(). + * This attaches @handle to the running transaction (or creates one if there's + * not transaction running). Unlike jbd2_journal_start() this function cannot + * block on journal commit, checkpointing, or similar stuff. It can block on + * memory allocation or frozen journal though. + * + * Return 0 on success, non-zero on error - handle is freed in that case. + */ +int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, + unsigned int line_no) +{ + journal_t *journal = handle->h_journal; + int ret = -EIO; + + if (WARN_ON(!handle->h_reserved)) { + /* Someone passed in normal handle? Just stop it. */ + jbd2_journal_stop(handle); + return ret; + } + /* + * Usefulness of mixing of reserved and unreserved handles is + * questionable. So far nobody seems to need it so just error out. + */ + if (WARN_ON(current->journal_info)) { + jbd2_journal_free_reserved(handle); + return ret; + } + + handle->h_journal = NULL; + /* + * GFP_NOFS is here because callers are likely from writeback or + * similarly constrained call sites + */ + ret = start_this_handle(journal, handle, GFP_NOFS); + if (ret < 0) + jbd2_journal_free_reserved(handle); + handle->h_type = type; + handle->h_line_no = line_no; + return ret; +} +EXPORT_SYMBOL(jbd2_journal_start_reserved); /** * int jbd2_journal_extend() - extend buffer credits. @@ -424,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start); int jbd2_journal_extend(handle_t *handle, int nblocks) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; int result; int wanted; - result = -EIO; + WARN_ON(!transaction); if (is_handle_aborted(handle)) - goto out; + return -EROFS; + journal = transaction->t_journal; result = 1; read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ - if (handle->h_transaction->t_state != T_RUNNING) { + if (transaction->t_state != T_RUNNING) { jbd_debug(3, "denied handle %p %d blocks: " "transaction not running\n", handle, nblocks); goto error_out; } spin_lock(&transaction->t_handle_lock); - wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; + wanted = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " "transaction too large\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); goto unlock; } - if (wanted > __jbd2_log_space_left(journal)) { + if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > + jbd2_log_space_left(journal)) { jbd_debug(3, "denied handle %p %d blocks: " "insufficient log space\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); goto unlock; } trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, - handle->h_transaction->t_tid, + transaction->t_tid, handle->h_type, handle->h_line_no, handle->h_buffer_credits, nblocks); handle->h_buffer_credits += nblocks; handle->h_requested_credits += nblocks; - atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -474,7 +599,6 @@ unlock: spin_unlock(&transaction->t_handle_lock); error_out: read_unlock(&journal->j_state_lock); -out: return result; } @@ -491,19 +615,22 @@ out: * to a running handle, a call to jbd2_journal_restart will commit the * handle's transaction so far and reattach the handle to a new * transaction capabable of guaranteeing the requested number of - * credits. + * credits. We preserve reserved handle if there's any attached to the + * passed in handle. */ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; tid_t tid; int need_to_start, ret; + WARN_ON(!transaction); /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; + journal = transaction->t_journal; /* * First unlink the handle from its current transaction, and start the @@ -516,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) spin_lock(&transaction->t_handle_lock); atomic_sub(handle->h_buffer_credits, &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) { + sub_reserved_credits(journal, + handle->h_rsv_handle->h_buffer_credits); + } if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); + tid = transaction->t_tid; spin_unlock(&transaction->t_handle_lock); + handle->h_transaction = NULL; + current->journal_info = NULL; jbd_debug(2, "restarting handle %p\n", handle); - tid = transaction->t_tid; need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) @@ -558,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal) write_lock(&journal->j_state_lock); ++journal->j_barrier_count; + /* Wait until there are no reserved handles */ + if (atomic_read(&journal->j_reserved_credits)) { + write_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) == 0); + write_lock(&journal->j_state_lock); + } + /* Wait until there are no running updates */ while (1) { transaction_t *transaction = journal->j_running_transaction; @@ -620,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +static int sleep_on_shadow_bh(void *word) +{ + io_schedule(); + return 0; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -635,16 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { struct buffer_head *bh; - transaction_t *transaction; + transaction_t *transaction = handle->h_transaction; journal_t *journal; int error; char *frozen_buffer = NULL; int need_copy = 0; + unsigned long start_lock, time_lock; + WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; - - transaction = handle->h_transaction; journal = transaction->t_journal; jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); @@ -655,9 +802,16 @@ repeat: /* @@@ Need to check for errors here at some point. */ + start_lock = jiffies; lock_buffer(bh); jbd_lock_bh_state(bh); + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); + if (time_lock > HZ/10) + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, + jiffies_to_msecs(time_lock)); + /* We now hold the buffer lock so it is safe to query the buffer * state. Is the buffer dirty? * @@ -747,41 +901,29 @@ repeat: * journaled. If the primary copy is already going to * disk then we cannot do copy-out here. */ - if (jh->b_jlist == BJ_Shadow) { - DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); - wait_queue_head_t *wqh; - - wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); - + if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); - /* commit wakes up all shadow buffers after IO */ - for ( ; ; ) { - prepare_to_wait(wqh, &wait.wait, - TASK_UNINTERRUPTIBLE); - if (jh->b_jlist != BJ_Shadow) - break; - schedule(); - } - finish_wait(wqh, &wait.wait); + wait_on_bit(&bh->b_state, BH_Shadow, + sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); goto repeat; } - /* Only do the copy if the currently-owning transaction - * still needs it. If it is on the Forget list, the - * committing transaction is past that stage. The - * buffer had better remain locked during the kmalloc, - * but that should be true --- we hold the journal lock - * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. + /* + * Only do the copy if the currently-owning transaction still + * needs it. If buffer isn't on BJ_Metadata list, the + * committing transaction is past that stage (here we use the + * fact that BH_Shadow is set under bh_state lock together with + * refiling to BJ_Shadow list and at this point we know the + * buffer doesn't have BH_Shadow set). * * Subtle point, though: if this is a get_undo_access, * then we will be relying on the frozen_data to contain * the new value of the committed_data record after the * transaction, so we HAVE to force the frozen_data copy - * in that case. */ - - if (jh->b_jlist != BJ_Forget || force_copy) { + * in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); @@ -908,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; struct journal_head *jh = jbd2_journal_add_journal_head(bh); int err; jbd_debug(5, "journal_head %p\n", jh); + WARN_ON(!transaction); err = -EROFS; if (is_handle_aborted(handle)) goto out; + journal = transaction->t_journal; err = 0; JBUFFER_TRACE(jh, "entry"); @@ -1121,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; struct journal_head *jh; int ret = 0; + WARN_ON(!transaction); if (is_handle_aborted(handle)) - goto out; + return -EROFS; + journal = transaction->t_journal; jh = jbd2_journal_grab_journal_head(bh); if (!jh) { ret = -EUCLEAN; @@ -1220,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); @@ -1251,12 +1397,17 @@ out: int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; struct journal_head *jh; int drop_reserve = 0; int err = 0; int was_modified = 0; + WARN_ON(!transaction); + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); @@ -1283,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) */ jh->b_modified = 0; - if (jh->b_transaction == handle->h_transaction) { + if (jh->b_transaction == transaction) { J_ASSERT_JH(jh, !jh->b_frozen_data); /* If we are forgetting a buffer which is already part @@ -1378,19 +1529,21 @@ drop: int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int err, wait_for_commit = 0; + journal_t *journal; + int err = 0, wait_for_commit = 0; tid_t tid; pid_t pid; + if (!transaction) + goto free_and_exit; + journal = transaction->t_journal; + J_ASSERT(journal_current_handle() == handle); if (is_handle_aborted(handle)) err = -EIO; - else { + else J_ASSERT(atomic_read(&transaction->t_updates) > 0); - err = 0; - } if (--handle->h_ref > 0) { jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, @@ -1400,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, - handle->h_transaction->t_tid, + transaction->t_tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, @@ -1511,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle) lock_map_release(&handle->h_lockdep_map); + if (handle->h_rsv_handle) + jbd2_journal_free_reserved(handle->h_rsv_handle); +free_and_exit: jbd2_free_handle(handle); return err; } -/** - * int jbd2_journal_force_commit() - force any uncommitted transactions - * @journal: journal to force - * - * For synchronous operations: force any uncommitted transactions - * to disk. May seem kludgy, but it reuses all the handle batching - * code in a very simple manner. - */ -int jbd2_journal_force_commit(journal_t *journal) -{ - handle_t *handle; - int ret; - - handle = jbd2_journal_start(journal, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - } else { - handle->h_sync = 1; - ret = jbd2_journal_stop(handle); - } - return ret; -} - /* * * List management code snippets: various functions for manipulating the @@ -1594,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, - * t_log_list or t_reserved_list. If the caller is holding onto a copy of one - * of these pointers, it could go bad. Generally the caller needs to re-read - * the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or + * t_reserved_list. If the caller is holding onto a copy of one of these + * pointers, it could go bad. Generally the caller needs to re-read the + * pointer from the transaction_t. * * Called under j_list_lock. */ @@ -1627,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Forget: list = &transaction->t_forget; break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; case BJ_Shadow: list = &transaction->t_shadow_list; break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; case BJ_Reserved: list = &transaction->t_reserved_list; break; @@ -2027,18 +2154,23 @@ zap_buffer_unlocked: * void jbd2_journal_invalidatepage() * @journal: journal to use for flush... * @page: page to flush - * @offset: length of page to invalidate. + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. Can return -EBUSY - * if buffers are part of the committing transaction and the page is straddling - * i_size. Caller then has to wait for current commit and try again. + * Reap page buffers containing data after in the specified range in page. + * Can return -EBUSY if buffers are part of the committing transaction and + * the page is straddling i_size. Caller then has to wait for current commit + * and try again. */ int jbd2_journal_invalidatepage(journal_t *journal, struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; int ret = 0; @@ -2047,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return 0; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ @@ -2056,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return 0; + if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - ret = journal_unmap_buffer(journal, bh, offset > 0); + ret = journal_unmap_buffer(journal, bh, partial_page); unlock_buffer(bh); if (ret < 0) return ret; @@ -2070,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { + if (!partial_page) { if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } @@ -2131,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Forget: list = &transaction->t_forget; break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; case BJ_Shadow: list = &transaction->t_shadow_list; break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; case BJ_Reserved: list = &transaction->t_reserved_list; break; @@ -2241,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; + WARN_ON(!transaction); if (is_handle_aborted(handle)) - return -EIO; + return -EROFS; + journal = transaction->t_journal; jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index acd46a4160cb..e3aac222472e 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -22,7 +22,7 @@ #include <linux/time.h> #include "nodelist.h" -static int jffs2_readdir (struct file *, void *, filldir_t); +static int jffs2_readdir (struct file *, struct dir_context *); static int jffs2_create (struct inode *,struct dentry *,umode_t, bool); @@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *, const struct file_operations jffs2_dir_operations = { .read = generic_read_dir, - .readdir = jffs2_readdir, + .iterate = jffs2_readdir, .unlocked_ioctl=jffs2_ioctl, .fsync = jffs2_fsync, .llseek = generic_file_llseek, @@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, /***********************************************************************/ -static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int jffs2_readdir(struct file *file, struct dir_context *ctx) { - struct jffs2_inode_info *f; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_full_dirent *fd; - unsigned long offset, curofs; + unsigned long curofs = 1; - jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", - file_inode(filp)->i_ino); + jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino); - f = JFFS2_INODE_INFO(inode); - - offset = filp->f_pos; - - if (offset == 0) { - jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino); - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - offset++; - } - if (offset == 1) { - unsigned long pino = parent_ino(filp->f_path.dentry); - jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino); - if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) - goto out; - offset++; - } + if (!dir_emit_dots(file, ctx)) + return 0; - curofs=1; mutex_lock(&f->sem); for (fd = f->dents; fd; fd = fd->next) { - curofs++; - /* First loop: curofs = 2; offset = 2 */ - if (curofs < offset) { + /* First loop: curofs = 2; pos = 2 */ + if (curofs < ctx->pos) { jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", - fd->name, fd->ino, fd->type, curofs, offset); + fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos); continue; } if (!fd->ino) { jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", fd->name); - offset++; + ctx->pos++; continue; } jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", - offset, fd->name, fd->ino, fd->type); - if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) + (unsigned long)ctx->pos, fd->name, fd->ino, fd->type); + if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type)) break; - offset++; + ctx->pos++; } mutex_unlock(&f->sem); - out: - filp->f_pos = offset; return 0; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b7dc47ba675e..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -125,7 +126,7 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) { int wait = wbc->sync_mode == WB_SYNC_ALL; - if (test_cflag(COMMIT_Nolink, inode)) + if (inode->i_nlink == 0) return 0; /* * If COMMIT_DIRTY is not set, the inode isn't really dirty. diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 9a55f53be5ff..370d7b6c5942 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", (unsigned long long) blkno, (unsigned long long) nblocks); - jfs_error(ip->i_sb, - "dbFree: block to be freed is outside the map"); + jfs_error(ip->i_sb, "block to be freed is outside the map\n"); return -EIO; } @@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) /* free the blocks. */ if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { - jfs_error(ip->i_sb, "dbFree: error in block map\n"); + jfs_error(ip->i_sb, "error in block map\n"); release_metapage(mp); IREAD_UNLOCK(ipbmap); return (rc); @@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap, printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", (unsigned long long) blkno, (unsigned long long) nblocks); - jfs_error(ipbmap->i_sb, - "dbUpdatePMap: blocks are outside the map"); + jfs_error(ipbmap->i_sb, "blocks are outside the map\n"); return -EIO; } @@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* the hint should be within the map */ if (hint >= mapSize) { - jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); + jfs_error(ip->i_sb, "the hint is outside the map\n"); return -EIO; } @@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) bmp = sbi->bmap; if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { IREAD_UNLOCK(ipbmap); - jfs_error(ip->i_sb, - "dbExtend: the block is outside the filesystem"); + jfs_error(ip->i_sb, "the block is outside the filesystem\n"); return -EIO; } @@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, u32 mask; if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { - jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocNext: Corrupt dmap page"); + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n"); return -EIO; } @@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp, s8 *leaf; if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { - jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocNear: Corrupt dmap page"); + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n"); return -EIO; } @@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) */ if (l2nb > bmp->db_agl2size) { jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAG: allocation request is larger than the " - "allocation group size"); + "allocation request is larger than the allocation group size\n"); return -EIO; } @@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) (unsigned long long) blkno, (unsigned long long) nblocks); jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAG: dbAllocCtl failed in free AG"); + "dbAllocCtl failed in free AG\n"); } return (rc); } @@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) budmin = dcp->budmin; if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { - jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAG: Corrupt dmapctl page"); + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); release_metapage(mp); return -EIO; } @@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) } if (n == 4) { jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAG: failed descending stree"); + "failed descending stree\n"); release_metapage(mp); return -EIO; } @@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) &blkno))) { if (rc == -ENOSPC) { jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAG: control page " - "inconsistent"); + "control page inconsistent\n"); return -EIO; } return (rc); @@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); if (rc == -ENOSPC) { jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAG: unable to allocate blocks"); + "unable to allocate blocks\n"); rc = -EIO; } return (rc); @@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) */ rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); if (rc == -ENOSPC) { - jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocAny: unable to allocate blocks"); + jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n"); return -EIO; } return (rc); @@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); if (totrim == NULL) { - jfs_error(bmp->db_ipbmap->i_sb, - "dbDiscardAG: no memory for trim array"); + jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n"); IWRITE_UNLOCK(ipbmap); return 0; } @@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) nblocks = 1 << l2nb; } else { /* Trim any already allocated blocks */ - jfs_error(bmp->db_ipbmap->i_sb, - "dbDiscardAG: -EIO"); + jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n"); break; } @@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { jfs_error(bmp->db_ipbmap->i_sb, - "dbFindCtl: Corrupt dmapctl page"); + "Corrupt dmapctl page\n"); release_metapage(mp); return -EIO; } @@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) if (rc) { if (lev != level) { jfs_error(bmp->db_ipbmap->i_sb, - "dbFindCtl: dmap inconsistent"); + "dmap inconsistent\n"); return -EIO; } return -ENOSPC; @@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) if (dp->tree.stree[ROOT] != L2BPERDMAP) { release_metapage(mp); jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocCtl: the dmap is not all free"); + "the dmap is not all free\n"); rc = -EIO; goto backout; } @@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * to indicate that we have leaked blocks. */ jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocCtl: I/O Error: Block Leakage."); + "I/O Error: Block Leakage\n"); continue; } dp = (struct dmap *) mp->data; @@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * to indicate that we have leaked blocks. */ release_metapage(mp); - jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocCtl: Block Leakage."); + jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n"); continue; } @@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, for (; nwords > 0; nwords -= nw) { if (leaf[word] < BUDMIN) { jfs_error(bmp->db_ipbmap->i_sb, - "dbAllocBits: leaf page " - "corrupt"); + "leaf page corrupt\n"); break; } @@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) dcp = (struct dmapctl *) mp->data; if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { - jfs_error(bmp->db_ipbmap->i_sb, - "dbAdjCtl: Corrupt dmapctl page"); + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); release_metapage(mp); return -EIO; } @@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) assert(level == bmp->db_maxlevel); if (bmp->db_maxfreebud != oldroot) { jfs_error(bmp->db_ipbmap->i_sb, - "dbAdjCtl: the maximum free buddy is " - "not the old root"); + "the maximum free buddy is not the old root\n"); } bmp->db_maxfreebud = dcp->stree[ROOT]; } @@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) p = BMAPBLKNO + nbperpage; /* L2 page */ l2mp = read_metapage(ipbmap, p, PSIZE, 0); if (!l2mp) { - jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); + jfs_error(ipbmap->i_sb, "L2 page could not be read\n"); return -EIO; } l2dcp = (struct dmapctl *) l2mp->data; @@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) } } /* for each L1 in a L2 */ - jfs_error(ipbmap->i_sb, - "dbExtendFS: function has not returned as expected"); + jfs_error(ipbmap->i_sb, "function has not returned as expected\n"); errout: if (l0mp) release_metapage(l0mp); @@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap) } if (bmp->db_agpref >= bmp->db_numag) { jfs_error(ipbmap->i_sb, - "cannot find ag with average freespace"); + "cannot find ag with average freespace\n"); } } diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 0ddbeceafc62..8743ba9c6742 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -124,21 +124,21 @@ struct dtsplit { #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* get page buffer for specified block address */ -#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ -{\ - BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ - if (!(RC))\ - {\ - if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ - ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ - {\ - BT_PUTPAGE(MP);\ - jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ - MP = NULL;\ - RC = -EIO;\ - }\ - }\ -} +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ +do { \ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ + if (!(RC)) { \ + if (((P)->header.nextindex > \ + (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + BT_PUTPAGE(MP); \ + jfs_error((IP)->i_sb, \ + "DT_GETPAGE: dtree page corrupt\n"); \ + MP = NULL; \ + RC = -EIO; \ + } \ + } \ +} while (0) /* for consistency */ #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) @@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, /* Something's corrupted, mark filesystem dirty so * chkdsk will fix it. */ - jfs_error(sb, "stack overrun in dtSearch!"); + jfs_error(sb, "stack overrun!\n"); BT_STACK_DUMP(btstack); rc = -EIO; goto out; @@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ -int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +int jfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *ip = file_inode(filp); + struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ @@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ - if (filp->f_pos == DIREND) + if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { @@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) */ do_index = 1; - dir_index = (u32) filp->f_pos; + dir_index = (u32) ctx->pos; if (dir_index > 1) { struct dir_table_slot dirtab_slot; @@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. Directory has shrunk */ - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected " "infinite loop!"); - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } goto repeat; @@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); - filp->f_pos = -1; + ctx->pos = -1; return 0; } } else { @@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * self "." */ - filp->f_pos = 0; - if (filldir(dirent, ".", 1, 0, ip->i_ino, - DT_DIR)) + ctx->pos = 0; + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ - filp->f_pos = 1; - if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) + ctx->pos = 1; + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } @@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ - dtpos = filp->f_pos; + dtpos = ctx->pos; if (dtpos == 0) { /* build "." entry */ - - if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, - DT_DIR)) + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 1; - filp->f_pos = dtpos; + ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 1) { /* build ".." entry */ - - if (filldir(dirent, "..", 2, filp->f_pos, - PARENT(ip), DT_DIR)) + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with " @@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } dtoffset->pn = 1; dtoffset->index = 0; - filp->f_pos = dtpos; + ctx->pos = dtpos; } if (dtEmpty(ip)) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } - if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { + if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d " "from dtReadNext", rc); - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } /* get start leaf page and index */ @@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* offset beyond directory eof ? */ if (bn < 0) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } } @@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); - filp->f_pos = DIREND; + ctx->pos = DIREND; return -ENOMEM; } @@ -3252,8 +3247,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* Sanity Check */ if (d_namleft == 0) { jfs_error(ip->i_sb, - "JFS:Dtree error: ino = " - "%ld, bn=%Ld, index = %d", + "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", (long)ip->i_ino, (long long)bn, i); @@ -3295,9 +3289,9 @@ skip_one: jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { - filp->f_pos = jfs_dirent->position; - if (filldir(dirent, jfs_dirent->name, - jfs_dirent->name_len, filp->f_pos, + ctx->pos = jfs_dirent->position; + if (!dir_emit(ctx, jfs_dirent->name, + jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); @@ -3309,7 +3303,7 @@ skip_one: } if (!overflow && (bn == 0)) { - filp->f_pos = DIREND; + ctx->pos = DIREND; break; } @@ -3373,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) */ if (BT_STACK_FULL(btstack)) { DT_PUTPAGE(mp); - jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); + jfs_error(ip->i_sb, "btstack overrun\n"); BT_STACK_DUMP(btstack); return -EIO; } diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index 2545bb317235..fd4169e6e698 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag); -extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); +extern int jfs_readdir(struct file *file, struct dir_context *ctx); #endif /* !_H_JFS_DTREE */ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index e5fe8506ed16..2ae7d59ab10a 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) if ((rc == 0) && xlen) { if (xlen != nbperpage) { - jfs_error(ip->i_sb, "extHint: corrupt xtree"); + jfs_error(ip->i_sb, "corrupt xtree\n"); rc = -EIO; } XADaddress(xp, xaddr); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 6ba4006e011b..f321986e73d2 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -386,7 +386,7 @@ int diRead(struct inode *ip) dp += rel_inode; if (ip->i_ino != le32_to_cpu(dp->di_number)) { - jfs_error(ip->i_sb, "diRead: i_ino != di_number"); + jfs_error(ip->i_sb, "i_ino != di_number\n"); rc = -EIO; } else if (le32_to_cpu(dp->di_nlink) == 0) rc = -ESTALE; @@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip) if (!addressPXD(&(jfs_ip->ixpxd)) || (lengthPXD(&(jfs_ip->ixpxd)) != JFS_IP(ipimap)->i_imap->im_nbperiext)) { - jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); + jfs_error(ip->i_sb, "ixpxd invalid\n"); return -EIO; } @@ -893,8 +893,7 @@ int diFree(struct inode *ip) if (iagno >= imap->im_nextiag) { print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, imap, 32, 0); - jfs_error(ip->i_sb, - "diFree: inum = %d, iagno = %d, nextiag = %d", + jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n", (uint) inum, iagno, imap->im_nextiag); return -EIO; } @@ -930,15 +929,14 @@ int diFree(struct inode *ip) mask = HIGHORDER >> bitno; if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { - jfs_error(ip->i_sb, - "diFree: wmap shows inode already free"); + jfs_error(ip->i_sb, "wmap shows inode already free\n"); } if (!addressPXD(&iagp->inoext[extno])) { release_metapage(mp); IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); - jfs_error(ip->i_sb, "diFree: invalid inoext"); + jfs_error(ip->i_sb, "invalid inoext\n"); return -EIO; } @@ -950,7 +948,7 @@ int diFree(struct inode *ip) release_metapage(mp); IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); - jfs_error(ip->i_sb, "diFree: numfree > numinos"); + jfs_error(ip->i_sb, "numfree > numinos\n"); return -EIO; } /* @@ -1199,7 +1197,7 @@ int diFree(struct inode *ip) * for the inode being freed. */ if (iagp->pmap[extno] != 0) { - jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); + jfs_error(ip->i_sb, "the pmap does not show inode free\n"); } iagp->wmap[extno] = 0; PXDlength(&iagp->inoext[extno], 0); @@ -1493,7 +1491,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* mask any prior bits for the starting words of the * summary map. */ - mask = ONES << (EXTSPERSUM - bitno); + mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno)); inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; @@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) release_metapage(mp); AG_UNLOCK(imap, agno); jfs_error(ip->i_sb, - "diAlloc: can't find free bit " - "in wmap"); + "can't find free bit in wmap\n"); return -EIO; } @@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) numinos = imap->im_agctl[agno].numinos; if (numfree > numinos) { - jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); + jfs_error(ip->i_sb, "numfree > numinos\n"); return -EIO; } @@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) if (!iagp->nfreeinos) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); - jfs_error(ip->i_sb, - "diAllocIno: nfreeinos = 0, but iag on freelist"); + jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n"); return -EIO; } @@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); jfs_error(ip->i_sb, - "diAllocIno: free inode not found in summary map"); + "free inode not found in summary map\n"); return -EIO; } @@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) if (rem >= EXTSPERSUM) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); - jfs_error(ip->i_sb, "diAllocIno: no free extent found"); + jfs_error(ip->i_sb, "no free extent found\n"); return -EIO; } extno = (sword << L2EXTSPERSUM) + rem; @@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) if (rem >= INOSPEREXT) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); - jfs_error(ip->i_sb, "diAllocIno: free inode not found"); + jfs_error(ip->i_sb, "free inode not found\n"); return -EIO; } @@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(imap->im_ipimap); - jfs_error(ip->i_sb, "diAllocExt: error reading iag"); + jfs_error(ip->i_sb, "error reading iag\n"); return rc; } iagp = (struct iag *) mp->data; @@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) if (sword >= SMAPSZ) { release_metapage(mp); IREAD_UNLOCK(imap->im_ipimap); - jfs_error(ip->i_sb, - "diAllocExt: free ext summary map not found"); + jfs_error(ip->i_sb, "free ext summary map not found\n"); return -EIO; } if (~iagp->extsmap[sword]) @@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) if (rem >= EXTSPERSUM) { release_metapage(mp); IREAD_UNLOCK(imap->im_ipimap); - jfs_error(ip->i_sb, "diAllocExt: free extent not found"); + jfs_error(ip->i_sb, "free extent not found\n"); return -EIO; } extno = (sword << L2EXTSPERSUM) + rem; @@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) if (bmp) release_metapage(bmp); - jfs_error(imap->im_ipimap->i_sb, - "diAllocBit: iag inconsistent"); + jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n"); return -EIO; } @@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* better have free extents. */ if (!iagp->nfreeexts) { - jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); + jfs_error(imap->im_ipimap->i_sb, "no free extents\n"); return -EIO; } @@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) } if (ciagp == NULL) { jfs_error(imap->im_ipimap->i_sb, - "diNewExt: ciagp == NULL"); + "ciagp == NULL\n"); rc = -EIO; goto error_out; } @@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) IWRITE_UNLOCK(ipimap); IAGFREE_UNLOCK(imap); jfs_error(imap->im_ipimap->i_sb, - "diNewIAG: ipimap->i_size is wrong"); + "ipimap->i_size is wrong\n"); return -EIO; } @@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap, iagno = INOTOIAG(inum); /* make sure that the iag is contained within the map */ if (iagno >= imap->im_nextiag) { - jfs_error(ipimap->i_sb, - "diUpdatePMap: the iag is outside the map"); + jfs_error(ipimap->i_sb, "the iag is outside the map\n"); return -EIO; } /* read the iag */ @@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap, */ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { jfs_error(ipimap->i_sb, - "diUpdatePMap: inode %ld not marked as " - "allocated in wmap!", inum); + "inode %ld not marked as allocated in wmap!\n", + inum); } if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { jfs_error(ipimap->i_sb, - "diUpdatePMap: inode %ld not marked as " - "allocated in pmap!", inum); + "inode %ld not marked as allocated in pmap!\n", + inum); } /* update the bitmap for the extent of the freed inode */ iagp->pmap[extno] &= cpu_to_le32(~mask); @@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap, if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { release_metapage(mp); jfs_error(ipimap->i_sb, - "diUpdatePMap: the inode is not allocated in " - "the working map"); + "the inode is not allocated in the working map\n"); return -EIO; } if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { release_metapage(mp); jfs_error(ipimap->i_sb, - "diUpdatePMap: the inode is not free in the " - "persistent map"); + "the inode is not free in the persistent map\n"); return -EIO; } /* update the bitmap for the extent of the allocated inode */ @@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) iagp = (struct iag *) bp->data; if (le32_to_cpu(iagp->iagnum) != i) { release_metapage(bp); - jfs_error(ipimap->i_sb, - "diExtendFs: unexpected value of iagnum"); + jfs_error(ipimap->i_sb, "unexpected value of iagnum\n"); return -EIO; } @@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) if (xnuminos != atomic_read(&imap->im_numinos) || xnumfree != atomic_read(&imap->im_numfree)) { - jfs_error(ipimap->i_sb, - "diExtendFs: numinos or numfree incorrect"); + jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n"); return -EIO; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 2eb952c41a69..360d27c48887 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,8 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) */ void jfs_syncpt(struct jfs_log *log, int hard_sync) { LOG_LOCK(log); - lmLogSync(log, hard_sync); + if (!test_bit(log_QUIESCE, &log->flag)) + lmLogSync(log, hard_sync); LOG_UNLOCK(log); } @@ -2004,12 +2005,17 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ_SYNC, bio); + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(READ_SYNC, bio); + } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); @@ -2145,7 +2151,6 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 6740d34cd82b..d165cde0c68d 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) return ret; } -static void metapage_invalidatepage(struct page *page, unsigned long offset) +static void metapage_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - BUG_ON(offset); + BUG_ON(offset || length < PAGE_CACHE_SIZE); BUG_ON(PageWriteback(page)); @@ -646,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, if (mp) { if (mp->logical_size != size) { jfs_error(inode->i_sb, - "__get_metapage: mp->logical_size != size"); + "get_mp->logical_size != size\n"); jfs_err("logical_size = %d, size = %d", mp->logical_size, size); dump_stack(); @@ -657,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, if (test_bit(META_discard, &mp->flag)) { if (!new) { jfs_error(inode->i_sb, - "__get_metapage: using a " - "discarded metapage"); + "using a discarded metapage\n"); discard_metapage(mp); goto unlock; } diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h index 884fc21ab8ee..04847b8d3070 100644 --- a/fs/jfs/jfs_superblock.h +++ b/fs/jfs/jfs_superblock.h @@ -108,6 +108,7 @@ struct jfs_superblock { extern int readSuper(struct super_block *, struct buffer_head **); extern int updateSuper(struct super_block *, uint); +__printf(2, 3) extern void jfs_error(struct super_block *, const char *, ...); extern int jfs_mount(struct super_block *); extern int jfs_mount_rw(struct super_block *, int); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 5fcc02eaa64c..564c4f279ac6 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty) * mark filesystem dirty */ if (dirty) - jfs_error(tblk->sb, "txAbort"); + jfs_error(tblk->sb, "\n"); return; } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 6c50871e6220..5ad7748860ce 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -64,22 +64,23 @@ /* get page buffer for specified block address */ /* ToDo: Replace this ugly macro with a function */ -#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ -{\ - BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ - if (!(RC))\ - {\ - if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ - (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ - (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ - {\ - jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ - BT_PUTPAGE(MP);\ - MP = NULL;\ - RC = -EIO;\ - }\ - }\ -} +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ +do { \ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \ + if (!(RC)) { \ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \ + (le16_to_cpu((P)->header.nextindex) > \ + le16_to_cpu((P)->header.maxentry)) || \ + (le16_to_cpu((P)->header.maxentry) > \ + (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \ + jfs_error((IP)->i_sb, \ + "XT_GETPAGE: xtree page corrupt\n"); \ + BT_PUTPAGE(MP); \ + MP = NULL; \ + RC = -EIO; \ + } \ + } \ +} while (0) /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) @@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, /* push (bn, index) of the parent page/entry */ if (BT_STACK_FULL(btstack)) { - jfs_error(ip->i_sb, "stack overrun in xtSearch!"); + jfs_error(ip->i_sb, "stack overrun!\n"); XT_PUTPAGE(mp); return -EIO; } @@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */ if (cmp != 0) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); + jfs_error(ip->i_sb, "xtSearch did not find extent\n"); return -EIO; } @@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */ xad = &p->xad[index]; if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); + jfs_error(ip->i_sb, "extension is not contiguous\n"); return -EIO; } @@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", if (cmp != 0) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); + jfs_error(ip->i_sb, "couldn't find extent\n"); return -EIO; } @@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", nextindex = le16_to_cpu(p->header.nextindex); if (index != nextindex - 1) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, - "xtTailgate: the entry found is not the last entry"); + jfs_error(ip->i_sb, "the entry found is not the last entry\n"); return -EIO; } @@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) if (cmp != 0) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); + jfs_error(ip->i_sb, "Could not find extent\n"); return -EIO; } @@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) (nxoff + nxlen > xoff + xlen)) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, - "xtUpdate: nXAD in not completely contained within XAD"); + "nXAD in not completely contained within XAD\n"); return -EIO; } @@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) if (xoff >= nxoff) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); + jfs_error(ip->i_sb, "xoff >= nxoff\n"); return -EIO; } /* #endif _JFS_WIP_COALESCE */ @@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) if (cmp != 0) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); + jfs_error(ip->i_sb, "xtSearch failed\n"); return -EIO; } if (index0 != index) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, - "xtUpdate: unexpected value of index"); + jfs_error(ip->i_sb, "unexpected value of index\n"); return -EIO; } } @@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) getChild: /* save current parent entry for the child page */ if (BT_STACK_FULL(&btstack)) { - jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); + jfs_error(ip->i_sb, "stack overrun!\n"); XT_PUTPAGE(mp); return -EIO; } @@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) if (cmp != 0) { XT_PUTPAGE(mp); - jfs_error(ip->i_sb, - "xtTruncate_pmap: did not find extent"); + jfs_error(ip->i_sb, "did not find extent\n"); return -EIO; } } else { @@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) getChild: /* save current parent entry for the child page */ if (BT_STACK_FULL(&btstack)) { - jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); + jfs_error(ip->i_sb, "stack overrun!\n"); XT_PUTPAGE(mp); return -EIO; } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 3b91a7ad6086..aa8a3370631b 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!S_ISDIR(old_ip->i_mode) && new_ip) IWRITE_UNLOCK(new_ip); jfs_error(new_ip->i_sb, - "jfs_rename: new_ip->i_nlink != 0"); + "new_ip->i_nlink != 0\n"); return -EIO; } tblk = tid_to_tblock(tid); @@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = { const struct file_operations jfs_dir_operations = { .read = generic_read_dir, - .readdir = jfs_readdir, + .iterate = jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, #ifdef CONFIG_COMPAT @@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = { .llseek = generic_file_llseek, }; -static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, - struct qstr *this) +static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) { unsigned long hash; int i; @@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, return 0; } -static int jfs_ci_compare(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 8d0c1c7c0820..90b3bc21e9b0 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto resume; error_out: - jfs_error(sb, "jfs_extendfs"); + jfs_error(sb, "\n"); resume: /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2003e830ed1c..6669aa2042c3 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb) /* nothing is done for continue beyond marking the superblock dirty */ } -void jfs_error(struct super_block *sb, const char * function, ...) +void jfs_error(struct super_block *sb, const char *fmt, ...) { - static char error_buf[256]; + struct va_format vaf; va_list args; - va_start(args, function); - vsnprintf(error_buf, sizeof(error_buf), function, args); - va_end(args); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("ERROR: (device %s): %pf: %pV\n", + sb->s_id, __builtin_return_address(0), &vaf); - pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf); + va_end(args); jfs_handle_error(sb); } @@ -611,11 +615,28 @@ static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; + int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { txQuiesce(sb); - lmLogShutdown(log); - updateSuper(sb, FM_CLEAN); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "lmLogShutdown failed\n"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed\n"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } } return 0; } @@ -627,13 +648,18 @@ static int jfs_unfreeze(struct super_block *sb) int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { - updateSuper(sb, FM_MOUNT); - if ((rc = lmLogInit(log))) - jfs_err("jfs_unlock failed with return code %d", rc); - else - txResume(sb); + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "updateSuper failed\n"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "lmLogInit failed\n"); +out: + txResume(sb); } - return 0; + return rc; } static struct dentry *jfs_do_mount(struct file_system_type *fs_type, diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 42d67f9757bf..d3472f4cd530 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) nbytes = sizeDXD(&ji->ea); if (!nbytes) { - jfs_error(sb, "ea_read: nbytes is 0"); + jfs_error(sb, "nbytes is 0\n"); return -EIO; } @@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) current_blocks = 0; } else { if (!(ji->ea.flag & DXD_EXTENT)) { - jfs_error(sb, "ea_get: invalid ea.flag)"); + jfs_error(sb, "invalid ea.flag\n"); return -EIO; } current_blocks = (ea_size + sb->s_blocksize - 1) >> @@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; tid_t *tid = fs_info; diff --git a/fs/libfs.c b/fs/libfs.c index 916da8c4158b..c3a0837fb861 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode) * both impossible due to the lock on directory. */ -int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) +int dcache_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct dentry *cursor = filp->private_data; + struct dentry *dentry = file->f_path.dentry; + struct dentry *cursor = file->private_data; struct list_head *p, *q = &cursor->d_u.d_child; - ino_t ino; - int i = filp->f_pos; - switch (i) { - case 0: - ino = dentry->d_inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - default: - spin_lock(&dentry->d_lock); - if (filp->f_pos == 2) - list_move(q, &dentry->d_subdirs); - - for (p=q->next; p != &dentry->d_subdirs; p=p->next) { - struct dentry *next; - next = list_entry(p, struct dentry, d_u.d_child); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - if (!simple_positive(next)) { - spin_unlock(&next->d_lock); - continue; - } + if (!dir_emit_dots(file, ctx)) + return 0; + spin_lock(&dentry->d_lock); + if (ctx->pos == 2) + list_move(q, &dentry->d_subdirs); + + for (p = q->next; p != &dentry->d_subdirs; p = p->next) { + struct dentry *next = list_entry(p, struct dentry, d_u.d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); + continue; + } - spin_unlock(&next->d_lock); - spin_unlock(&dentry->d_lock); - if (filldir(dirent, next->d_name.name, - next->d_name.len, filp->f_pos, - next->d_inode->i_ino, - dt_type(next->d_inode)) < 0) - return 0; - spin_lock(&dentry->d_lock); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - /* next is still alive */ - list_move(q, p); - spin_unlock(&next->d_lock); - p = q; - filp->f_pos++; - } - spin_unlock(&dentry->d_lock); + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); + if (!dir_emit(ctx, next->d_name.name, next->d_name.len, + next->d_inode->i_ino, dt_type(next->d_inode))) + return 0; + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + /* next is still alive */ + list_move(q, p); + spin_unlock(&next->d_lock); + p = q; + ctx->pos++; } + spin_unlock(&dentry->d_lock); return 0; } @@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = { .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, - .readdir = dcache_readdir, + .iterate = dcache_readdir, .fsync = noop_fsync, }; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0796c45d0d4d..01bfe7662751 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -144,6 +144,9 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) timeout); if (ret < 0) return -ERESTARTSYS; + /* Reset the lock status after a server reboot so we resend */ + if (block->b_status == nlm_lck_denied_grace_period) + block->b_status = nlm_lck_blocked; req->a_res.status = block->b_status; return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 7e529c3c45c0..9760ecb9b60f 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -550,9 +550,6 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; - /* Resend the blocking lock request after a server reboot */ - if (resp->status == nlm_lck_denied_grace_period) - continue; if (resp->status != nlm_lck_blocked) break; } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index a2aa97d45670..10d6c41aecad 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv) svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); printk(KERN_WARNING diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e703318c41df..067778b0ccc9 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block) dprintk("lockd: unlinking block %p...\n", block); /* Remove block from list */ - status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl); + status = posix_unblock_lock(&block->b_call->a_args.lock.fl); nlmsvc_remove_block(block); return status; } @@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; } +/* + * Since NLM uses two "keys" for tracking locks, we need to hash them down + * to one for the blocked_hash. Here, we're just xor'ing the host address + * with the pid in order to create a key value for picking a hash bucket. + */ +static unsigned long +nlmsvc_owner_key(struct file_lock *fl) +{ + return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid; +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_compare_owner = nlmsvc_same_owner, + .lm_owner_key = nlmsvc_owner_key, .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, }; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 97e87415b145..dc5c75930f0f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; - lock_flocks(); /* protects i_flock list */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -181,7 +181,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; - unlock_flocks(); + spin_unlock(&inode->i_lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -193,7 +193,7 @@ again: goto again; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); return 0; } @@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file) if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops == &nlmsvc_lock_operations) { - unlock_flocks(); + spin_unlock(&inode->i_lock); return 1; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); file->f_locks = 0; return 0; } diff --git a/fs/locks.c b/fs/locks.c index cb424a4fed71..b27a3005d78d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -126,6 +126,9 @@ #include <linux/time.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> +#include <linux/hashtable.h> +#include <linux/percpu.h> +#include <linux/lglock.h> #include <asm/uaccess.h> @@ -153,30 +156,53 @@ int lease_break_time = 45; #define for_each_lock(inode, lockp) \ for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) -static LIST_HEAD(file_lock_list); -static LIST_HEAD(blocked_list); -static DEFINE_SPINLOCK(file_lock_lock); +/* + * The global file_lock_list is only used for displaying /proc/locks, so we + * keep a list on each CPU, with each list protected by its own spinlock via + * the file_lock_lglock. Note that alterations to the list also require that + * the relevant i_lock is held. + */ +DEFINE_STATIC_LGLOCK(file_lock_lglock); +static DEFINE_PER_CPU(struct hlist_head, file_lock_list); /* - * Protects the two list heads above, plus the inode->i_flock list + * The blocked_hash is used to find POSIX lock loops for deadlock detection. + * It is protected by blocked_lock_lock. + * + * We hash locks by lockowner in order to optimize searching for the lock a + * particular lockowner is waiting on. + * + * FIXME: make this value scale via some heuristic? We generally will want more + * buckets when we have more lockowners holding locks, but that's a little + * difficult to determine without knowing what the workload will look like. */ -void lock_flocks(void) -{ - spin_lock(&file_lock_lock); -} -EXPORT_SYMBOL_GPL(lock_flocks); +#define BLOCKED_HASH_BITS 7 +static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); -void unlock_flocks(void) -{ - spin_unlock(&file_lock_lock); -} -EXPORT_SYMBOL_GPL(unlock_flocks); +/* + * This lock protects the blocked_hash. Generally, if you're accessing it, you + * want to be holding this lock. + * + * In addition, it also protects the fl->fl_block list, and the fl->fl_next + * pointer for file_lock structures that are acting as lock requests (in + * contrast to those that are acting as records of acquired locks). + * + * Note that when we acquire this lock in order to change the above fields, + * we often hold the i_lock as well. In certain cases, when reading the fields + * protected by this lock, we can skip acquiring it iff we already hold the + * i_lock. + * + * In particular, adding an entry to the fl_block list requires that you hold + * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting + * an entry from the list however only requires the file_lock_lock. + */ +static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *filelock_cache __read_mostly; static void locks_init_lock_heads(struct file_lock *fl) { - INIT_LIST_HEAD(&fl->fl_link); + INIT_HLIST_NODE(&fl->fl_link); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); } @@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); BUG_ON(!list_empty(&fl->fl_block)); - BUG_ON(!list_empty(&fl->fl_link)); + BUG_ON(!hlist_unhashed(&fl->fl_link)); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); @@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner; } +/* Must be called with the i_lock held! */ +static inline void +locks_insert_global_locks(struct file_lock *fl) +{ + lg_local_lock(&file_lock_lglock); + fl->fl_link_cpu = smp_processor_id(); + hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); + lg_local_unlock(&file_lock_lglock); +} + +/* Must be called with the i_lock held! */ +static inline void +locks_delete_global_locks(struct file_lock *fl) +{ + /* + * Avoid taking lock if already unhashed. This is safe since this check + * is done while holding the i_lock, and new insertions into the list + * also require that it be held. + */ + if (hlist_unhashed(&fl->fl_link)) + return; + lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + hlist_del_init(&fl->fl_link); + lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); +} + +static unsigned long +posix_owner_key(struct file_lock *fl) +{ + if (fl->fl_lmops && fl->fl_lmops->lm_owner_key) + return fl->fl_lmops->lm_owner_key(fl); + return (unsigned long)fl->fl_owner; +} + +static inline void +locks_insert_global_blocked(struct file_lock *waiter) +{ + hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); +} + +static inline void +locks_delete_global_blocked(struct file_lock *waiter) +{ + hash_del(&waiter->fl_link); +} + /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. + * + * Must be called with blocked_lock_lock held. */ static void __locks_delete_block(struct file_lock *waiter) { + locks_delete_global_blocked(waiter); list_del_init(&waiter->fl_block); - list_del_init(&waiter->fl_link); waiter->fl_next = NULL; } -/* - */ -void locks_delete_block(struct file_lock *waiter) +static void locks_delete_block(struct file_lock *waiter) { - lock_flocks(); + spin_lock(&blocked_lock_lock); __locks_delete_block(waiter); - unlock_flocks(); + spin_unlock(&blocked_lock_lock); } -EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. + * + * Must be called with both the i_lock and blocked_lock_lock held. The fl_block + * list itself is protected by the file_lock_list, but by ensuring that the + * i_lock is also held on insertions we can avoid taking the blocked_lock_lock + * in some cases when we see that the fl_block list is empty. */ -static void locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter) +static void __locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) { BUG_ON(!list_empty(&waiter->fl_block)); - list_add_tail(&waiter->fl_block, &blocker->fl_block); waiter->fl_next = blocker; + list_add_tail(&waiter->fl_block, &blocker->fl_block); if (IS_POSIX(blocker)) - list_add(&waiter->fl_link, &blocked_list); + locks_insert_global_blocked(waiter); } -/* Wake up processes blocked waiting for blocker. - * If told to wait then schedule the processes until the block list - * is empty, otherwise empty the block list ourselves. +/* Must be called with i_lock held. */ +static void locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + spin_lock(&blocked_lock_lock); + __locks_insert_block(blocker, waiter); + spin_unlock(&blocked_lock_lock); +} + +/* + * Wake up processes blocked waiting for blocker. + * + * Must be called with the inode->i_lock held! */ static void locks_wake_up_blocks(struct file_lock *blocker) { + /* + * Avoid taking global lock if list is empty. This is safe since new + * blocked requests are only added to the list under the i_lock, and + * the i_lock is always held here. Note that removal from the fl_block + * list does not require the i_lock, so we must recheck list_empty() + * after acquiring the blocked_lock_lock. + */ + if (list_empty(&blocker->fl_block)) + return; + + spin_lock(&blocked_lock_lock); while (!list_empty(&blocker->fl_block)) { struct file_lock *waiter; @@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker) else wake_up(&waiter->fl_wait); } + spin_unlock(&blocked_lock_lock); } /* Insert file lock fl into an inode's lock list at the position indicated * by pos. At the same time add the lock to the global file lock list. + * + * Must be called with the i_lock held! */ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) { - list_add(&fl->fl_link, &file_lock_list); - fl->fl_nspid = get_pid(task_tgid(current)); /* insert into file's list */ fl->fl_next = *pos; *pos = fl; + + locks_insert_global_locks(fl); } /* @@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) * Wake up processes that are blocked waiting for this lock, * notify the FS that the lock has been cleared and * finally free the lock. + * + * Must be called with the i_lock held! */ static void locks_delete_lock(struct file_lock **thisfl_p) { struct file_lock *fl = *thisfl_p; + locks_delete_global_locks(fl); + *thisfl_p = fl->fl_next; fl->fl_next = NULL; - list_del_init(&fl->fl_link); if (fl->fl_nspid) { put_pid(fl->fl_nspid); @@ -625,8 +728,9 @@ void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; + struct inode *inode = file_inode(filp); - lock_flocks(); + spin_lock(&inode->i_lock); for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { if (!IS_POSIX(cfl)) continue; @@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) fl->fl_pid = pid_vnr(cfl->fl_nspid); } else fl->fl_type = F_UNLCK; - unlock_flocks(); + spin_unlock(&inode->i_lock); return; } EXPORT_SYMBOL(posix_test_lock); @@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) { struct file_lock *fl; - list_for_each_entry(fl, &blocked_list, fl_link) { + hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { if (posix_same_owner(fl, block_fl)) return fl->fl_next; } return NULL; } +/* Must be called with the blocked_lock_lock held! */ static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { @@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) return -ENOMEM; } - lock_flocks(); + spin_lock(&inode->i_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) { - unlock_flocks(); + spin_unlock(&inode->i_lock); cond_resched(); - lock_flocks(); + spin_lock(&inode->i_lock); } find_conflict: @@ -777,7 +882,7 @@ find_conflict: error = 0; out: - unlock_flocks(); + spin_unlock(&inode->i_lock); if (new_fl) locks_free_lock(new_fl); return error; @@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str struct file_lock *left = NULL; struct file_lock *right = NULL; struct file_lock **before; - int error, added = 0; + int error; + bool added = false; /* * We may need two file_lock structures for this operation, @@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str new_fl2 = locks_alloc_lock(); } - lock_flocks(); + spin_lock(&inode->i_lock); + /* + * New lock request. Walk all POSIX locks and look for conflicts. If + * there are any, either return error or put the request on the + * blocker's list of waiters and the global blocked_hash. + */ if (request->fl_type != F_UNLCK) { for_each_lock(inode, before) { fl = *before; @@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str error = -EAGAIN; if (!(request->fl_flags & FL_SLEEP)) goto out; + /* + * Deadlock detection and insertion into the blocked + * locks list must be done while holding the same lock! + */ error = -EDEADLK; - if (posix_locks_deadlock(request, fl)) - goto out; - error = FILE_LOCK_DEFERRED; - locks_insert_block(fl, request); + spin_lock(&blocked_lock_lock); + if (likely(!posix_locks_deadlock(request, fl))) { + error = FILE_LOCK_DEFERRED; + __locks_insert_block(fl, request); + } + spin_unlock(&blocked_lock_lock); goto out; } } @@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str before = &fl->fl_next; } - /* Process locks with this owner. */ + /* Process locks with this owner. */ while ((fl = *before) && posix_same_owner(request, fl)) { /* Detect adjacent or overlapping regions (if same lock type) */ @@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str continue; } request = fl; - added = 1; + added = true; } else { /* Processing for different lock types is a bit @@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (fl->fl_start > request->fl_end) break; if (request->fl_type == F_UNLCK) - added = 1; + added = true; if (fl->fl_start < request->fl_start) left = fl; /* If the next lock in the list has a higher end @@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_release_private(fl); locks_copy_private(fl, request); request = fl; - added = 1; + added = true; } } /* Go on to next lock. @@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str } /* - * The above code only modifies existing locks in case of - * merging or replacing. If new lock(s) need to be inserted - * all modifications are done bellow this, so it's safe yet to - * bail out. + * The above code only modifies existing locks in case of merging or + * replacing. If new lock(s) need to be inserted all modifications are + * done below this, so it's safe yet to bail out. */ error = -ENOLCK; /* "no luck" */ if (right && left == right && !new_fl2) @@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_wake_up_blocks(left); } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); /* * Free any unused locks. */ @@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode) /* * Search the lock list for this inode for any POSIX locks. */ - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; if (fl->fl_owner != owner) break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return fl ? -EAGAIN : 0; } @@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode) if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - lock_flocks(); + spin_lock(&inode->i_lock); time_out_leases(inode); @@ -1249,11 +1365,11 @@ restart: break_time++; } locks_insert_block(flock, new_fl); - unlock_flocks(); + spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - lock_flocks(); - __locks_delete_block(new_fl); + spin_lock(&inode->i_lock); + locks_delete_block(new_fl); if (error >= 0) { if (error == 0) time_out_leases(inode); @@ -1270,7 +1386,7 @@ restart: } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); locks_free_lock(new_fl); return error; } @@ -1323,9 +1439,10 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; + struct inode *inode = file_inode(filp); int type = F_UNLCK; - lock_flocks(); + spin_lock(&inode->i_lock); time_out_leases(file_inode(filp)); for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { @@ -1334,11 +1451,11 @@ int fcntl_getlease(struct file *filp) break; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); return type; } -int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) +static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; @@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; if ((arg == F_WRLCK) - && ((dentry->d_count > 1) + && ((d_count(dentry) > 1) || (atomic_read(&inode->i_count) > 1))) goto out; @@ -1403,7 +1520,7 @@ out: return error; } -int generic_delete_lease(struct file *filp, struct file_lock **flp) +static int generic_delete_lease(struct file *filp, struct file_lock **flp) { struct file_lock *fl, **before; struct dentry *dentry = filp->f_path.dentry; @@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). * - * Called with file_lock_lock held. + * Called with inode->i_lock held. */ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { @@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) { + struct inode *inode = file_inode(filp); int error; - lock_flocks(); + spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, lease); - unlock_flocks(); + spin_unlock(&inode->i_lock); return error; } @@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp) static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { struct file_lock *fl, *ret; + struct inode *inode = file_inode(filp); struct fasync_struct *new; int error; @@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) return -ENOMEM; } ret = fl; - lock_flocks(); + spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, &ret); if (error) { - unlock_flocks(); + spin_unlock(&inode->i_lock); locks_free_lock(fl); goto out_free_fasync; } @@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) new = NULL; error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - unlock_flocks(); + spin_unlock(&inode->i_lock); out_free_fasync: if (new) @@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp) fl.fl_ops->fl_release_private(&fl); } - lock_flocks(); + spin_lock(&inode->i_lock); before = &inode->i_flock; while ((fl = *before) != NULL) { @@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp) } before = &fl->fl_next; } - unlock_flocks(); + spin_unlock(&inode->i_lock); } /** * posix_unblock_lock - stop waiting for a file lock - * @filp: how the file was opened * @waiter: the lock which was waiting * * lockd needs to block waiting for locks. */ int -posix_unblock_lock(struct file *filp, struct file_lock *waiter) +posix_unblock_lock(struct file_lock *waiter) { int status = 0; - lock_flocks(); + spin_lock(&blocked_lock_lock); if (waiter->fl_next) __locks_delete_block(waiter); else status = -ENOENT; - unlock_flocks(); + spin_unlock(&blocked_lock_lock); return status; } - EXPORT_SYMBOL(posix_unblock_lock); /** @@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock); #include <linux/proc_fs.h> #include <linux/seq_file.h> +struct locks_iterator { + int li_cpu; + loff_t li_pos; +}; + static void lock_get_status(struct seq_file *f, struct file_lock *fl, loff_t id, char *pfx) { @@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, static int locks_show(struct seq_file *f, void *v) { + struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; - fl = list_entry(v, struct file_lock, fl_link); + fl = hlist_entry(v, struct file_lock, fl_link); - lock_get_status(f, fl, *((loff_t *)f->private), ""); + lock_get_status(f, fl, iter->li_pos, ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) - lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); + lock_get_status(f, bfl, iter->li_pos, " ->"); return 0; } static void *locks_start(struct seq_file *f, loff_t *pos) { - loff_t *p = f->private; + struct locks_iterator *iter = f->private; - lock_flocks(); - *p = (*pos + 1); - return seq_list_start(&file_lock_list, *pos); + iter->li_pos = *pos + 1; + lg_global_lock(&file_lock_lglock); + spin_lock(&blocked_lock_lock); + return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { - loff_t *p = f->private; - ++*p; - return seq_list_next(v, &file_lock_list, pos); + struct locks_iterator *iter = f->private; + + ++iter->li_pos; + return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) { - unlock_flocks(); + spin_unlock(&blocked_lock_lock); + lg_global_unlock(&file_lock_lglock); } static const struct seq_operations locks_seq_operations = { @@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = { static int locks_open(struct inode *inode, struct file *filp) { - return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); + return seq_open_private(filp, &locks_seq_operations, + sizeof(struct locks_iterator)); } static const struct file_operations proc_locks_operations = { @@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len) { struct file_lock *fl; int result = 1; - lock_flocks(); + + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (IS_POSIX(fl)) { if (fl->fl_type == F_RDLCK) @@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len) result = 0; break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return result; } @@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) { struct file_lock *fl; int result = 1; - lock_flocks(); + + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (IS_POSIX(fl)) { if ((fl->fl_end < start) || (fl->fl_start > (start + len))) @@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) result = 0; break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return result; } @@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { + int i; + filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + lg_lock_init(&file_lock_lglock, "file_lock_lglock"); + + for_each_possible_cpu(i) + INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + return 0; } diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index e784a217b500..550475ca6a0e 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_idx = 0; bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_sector = page->index * (PAGE_SIZE >> 9); @@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b82751082112..6bdc347008f5 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) /* FIXME: readdir currently has it's own dir_walk code. I don't see a good * way to combine the two copies */ -#define IMPLICIT_NODES 2 -static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +static int logfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); - loff_t pos = file->f_pos - IMPLICIT_NODES; + loff_t pos; struct page *page; struct logfs_disk_dentry *dd; - int full; + if (ctx->pos < 0) + return -EINVAL; + + if (!dir_emit_dots(file, ctx)) + return 0; + + pos = ctx->pos - 2; BUG_ON(pos < 0); - for (;; pos++) { + for (;; pos++, ctx->pos++) { + bool full; if (beyond_eof(dir, pos)) break; if (!logfs_exist_block(dir, pos)) { @@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) dd = kmap(page); BUG_ON(dd->namelen == 0); - full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), - pos, be64_to_cpu(dd->ino), dd->type); + full = !dir_emit(ctx, (char *)dd->name, + be16_to_cpu(dd->namelen), + be64_to_cpu(dd->ino), dd->type); kunmap(page); page_cache_release(page); if (full) break; } - - file->f_pos = pos + IMPLICIT_NODES; return 0; } -static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) -{ - struct inode *inode = file_inode(file); - ino_t pino = parent_ino(file->f_dentry); - int err; - - if (file->f_pos < 0) - return -EINVAL; - - if (file->f_pos == 0) { - if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) - return 0; - file->f_pos++; - } - if (file->f_pos == 1) { - if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) - return 0; - file->f_pos++; - } - - err = __logfs_readdir(file, buf, filldir); - return err; -} - static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) { dd->namelen = cpu_to_be16(name->len); @@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = { const struct file_operations logfs_dir_fops = { .fsync = logfs_fsync, .unlocked_ioctl = logfs_ioctl, - .readdir = logfs_readdir, + .iterate = logfs_readdir, .read = generic_read_dir, .llseek = default_llseek, }; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index c2219a6dd3c8..57914fc32b62 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) return __logfs_writepage(page); } -static void logfs_invalidatepage(struct page *page, unsigned long offset) +static void logfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct logfs_block *block = logfs_block(page); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 038da0991794..d448a777166b 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb) return area; } -static void map_invalidatepage(struct page *page, unsigned long l) +static void map_invalidatepage(struct page *page, unsigned int o, + unsigned int l) { return; } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index a9ed6f36e6ea..dfaf6fa9b7b5 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -16,12 +16,12 @@ typedef struct minix_dir_entry minix_dirent; typedef struct minix3_dir_entry minix3_dirent; -static int minix_readdir(struct file *, void *, filldir_t); +static int minix_readdir(struct file *, struct dir_context *); const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = minix_readdir, + .iterate = minix_readdir, .fsync = generic_file_fsync, }; @@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) return (void*)((char*)de + sbi->s_dirsize); } -static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int minix_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = filp->f_pos; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(sb); unsigned chunk_size = sbi->s_dirsize; - char *name; - __u32 inumber; + unsigned long npages = dir_pages(inode); + unsigned long pos = ctx->pos; + unsigned offset; + unsigned long n; - pos = (pos + chunk_size-1) & ~(chunk_size-1); + ctx->pos = pos = ALIGN(pos, chunk_size); if (pos >= inode->i_size) - goto done; + return 0; + + offset = pos & ~PAGE_CACHE_MASK; + n = pos >> PAGE_CACHE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; @@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; for ( ; p <= limit; p = minix_next_entry(p, sbi)) { + const char *name; + __u32 inumber; if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; name = de3->name; @@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) inumber = de->inode; } if (inumber) { - int over; - unsigned l = strnlen(name, sbi->s_namelen); - offset = p - kaddr; - over = filldir(dirent, name, l, - (n << PAGE_CACHE_SHIFT) | offset, - inumber, DT_UNKNOWN); - if (over) { + if (!dir_emit(ctx, name, l, + inumber, DT_UNKNOWN)) { dir_put_page(page); - goto done; + return 0; } } + ctx->pos += chunk_size; } dir_put_page(page); } - -done: - filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; return 0; } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 0db73d9dd668..cd950e2331b6 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, return error; } +static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int error; + struct inode *inode = minix_new_inode(dir, mode, &error); + if (inode) { + minix_set_inode(inode, 0); + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + } + return error; +} + static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = { .mknod = minix_mknod, .rename = minix_rename, .getattr = minix_getattr, + .tmpfile = minix_tmpfile, }; diff --git a/fs/mount.h b/fs/mount.h index cd5007980400..64a858143ff9 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -18,6 +18,12 @@ struct mnt_pcp { int mnt_writers; }; +struct mountpoint { + struct list_head m_hash; + struct dentry *m_dentry; + int m_count; +}; + struct mount { struct list_head mnt_hash; struct mount *mnt_parent; @@ -40,6 +46,7 @@ struct mount { struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ + struct mountpoint *mnt_mp; /* where is it mounted */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/namei.c b/fs/namei.c index 57ae9c8c66bf..b2beee7a733f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); + dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, - &this); + err = parent->d_op->d_hash(parent, &this); if (err < 0) break; } @@ -1976,7 +1975,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) { + if (!can_lookup(nd->inode)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, base->d_inode, &this); + int err = base->d_op->d_hash(base, &this); if (err < 0) return ERR_PTR(err); } @@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path, nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; - switch (nd->last_type) { - case LAST_DOTDOT: - case LAST_DOT: + if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (error) return error; - /* fallthrough */ - case LAST_ROOT: - error = complete_walk(nd); - if (error) - return error; - audit_inode(name, nd->path.dentry, 0); - if (open_flag & O_CREAT) { - error = -EISDIR; - goto out; - } - goto finish_open; - case LAST_BIND: - error = complete_walk(nd); - if (error) - return error; - audit_inode(name, dir, 0); goto finish_open; } @@ -2740,7 +2721,7 @@ static int do_last(struct nameidata *nd, struct path *path, if (error) return error; - audit_inode(name, dir, 0); + audit_inode(name, dir, LOOKUP_PARENT); error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) @@ -2841,19 +2822,19 @@ finish_lookup: } nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ +finish_open: error = complete_walk(nd); if (error) { path_put(&save_parent); return error; } + audit_inode(name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) + if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) goto out; - audit_inode(name, nd->path.dentry, 0); -finish_open: if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -2920,6 +2901,67 @@ stale_open: goto retry_lookup; } +static int do_tmpfile(int dfd, struct filename *pathname, + struct nameidata *nd, int flags, + const struct open_flags *op, + struct file *file, int *opened) +{ + static const struct qstr name = QSTR_INIT("/", 1); + struct dentry *dentry, *child; + struct inode *dir; + int error = path_lookupat(dfd, pathname->name, + flags | LOOKUP_DIRECTORY, nd); + if (unlikely(error)) + return error; + error = mnt_want_write(nd->path.mnt); + if (unlikely(error)) + goto out; + /* we want directory to be writable */ + error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); + if (error) + goto out2; + dentry = nd->path.dentry; + dir = dentry->d_inode; + if (!dir->i_op->tmpfile) { + error = -EOPNOTSUPP; + goto out2; + } + child = d_alloc(dentry, &name); + if (unlikely(!child)) { + error = -ENOMEM; + goto out2; + } + nd->flags &= ~LOOKUP_DIRECTORY; + nd->flags |= op->intent; + dput(nd->path.dentry); + nd->path.dentry = child; + error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); + if (error) + goto out2; + audit_inode(pathname, nd->path.dentry, 0); + error = may_open(&nd->path, op->acc_mode, op->open_flag); + if (error) + goto out2; + file->f_path.mnt = nd->path.mnt; + error = finish_open(file, nd->path.dentry, NULL, opened); + if (error) + goto out2; + error = open_check_o_direct(file); + if (error) { + fput(file); + } else if (!(op->open_flag & O_EXCL)) { + struct inode *inode = file_inode(file); + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } +out2: + mnt_drop_write(nd->path.mnt); +out: + path_put(&nd->path); + return error; +} + static struct file *path_openat(int dfd, struct filename *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { @@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname, file->f_flags = op->open_flag; + if (unlikely(file->f_flags & O_TMPFILE)) { + error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); + goto out; + } + error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out; @@ -2987,9 +3034,10 @@ out: } struct file *do_filp_open(int dfd, struct filename *pathname, - const struct open_flags *op, int flags) + const struct open_flags *op) { struct nameidata nd; + int flags = op->lookup_flags; struct file *filp; filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); @@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname, } struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *name, const struct open_flags *op, int flags) + const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename filename = { .name = name }; + int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; nd.root.dentry = dentry; - flags |= LOOKUP_ROOT; - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de mutex_lock(&inode->i_mutex); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0) + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else error = dir->i_op->link(old_dentry, dir, new_dentry); + + if (!error && (inode->i_state & I_LINKABLE)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_LINKABLE; + spin_unlock(&inode->i_lock); + } mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 341d3f564082..7b1ca9ba0b0a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -21,7 +21,8 @@ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/uaccess.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> #include "pnode.h" #include "internal.h" @@ -36,6 +37,7 @@ static int mnt_id_start = 0; static int mnt_group_start = 1; static struct list_head *mount_hashtable __read_mostly; +static struct list_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; @@ -605,6 +607,51 @@ struct vfsmount *lookup_mnt(struct path *path) } } +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); + struct mountpoint *mp; + + list_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); + mp->m_count++; + return mp; + } + } + + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + spin_lock(&dentry->d_lock); + if (d_unlinked(dentry)) { + spin_unlock(&dentry->d_lock); + kfree(mp); + return ERR_PTR(-ENOENT); + } + dentry->d_flags |= DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + mp->m_dentry = dentry; + mp->m_count = 1; + list_add(&mp->m_hash, chain); + return mp; +} + +static void put_mountpoint(struct mountpoint *mp) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + list_del(&mp->m_hash); + kfree(mp); + } +} + static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; @@ -633,27 +680,6 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* - * Clear dentry's mounted state if it has no remaining mounts. - * vfsmount_lock must be held for write. - */ -static void dentry_reset_mounted(struct dentry *dentry) -{ - unsigned u; - - for (u = 0; u < HASH_SIZE; u++) { - struct mount *p; - - list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { - if (p->mnt_mountpoint == dentry) - return; - } - } - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); -} - -/* * vfsmount lock must be held for write */ static void detach_mnt(struct mount *mnt, struct path *old_path) @@ -664,32 +690,35 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); - dentry_reset_mounted(old_path->dentry); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; } /* * vfsmount lock must be held for write */ -void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry, +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, struct mount *child_mnt) { + mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ - child_mnt->mnt_mountpoint = dget(dentry); + child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); + child_mnt->mnt_mp = mp; } /* * vfsmount lock must be held for write */ -static void attach_mnt(struct mount *mnt, struct path *path) +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) { - mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt); + mnt_set_mountpoint(parent, mp, mnt); list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts); + hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* @@ -1095,11 +1124,23 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -void release_mounts(struct list_head *head) +static LIST_HEAD(unmounted); /* protected by namespace_sem */ + +static void namespace_unlock(void) { struct mount *mnt; - while (!list_empty(head)) { - mnt = list_first_entry(head, struct mount, mnt_hash); + LIST_HEAD(head); + + if (likely(list_empty(&unmounted))) { + up_write(&namespace_sem); + return; + } + + list_splice_init(&unmounted, &head); + up_write(&namespace_sem); + + while (!list_empty(&head)) { + mnt = list_first_entry(&head, struct mount, mnt_hash); list_del_init(&mnt->mnt_hash); if (mnt_has_parent(mnt)) { struct dentry *dentry; @@ -1119,11 +1160,16 @@ void release_mounts(struct list_head *head) } } +static inline void namespace_lock(void) +{ + down_write(&namespace_sem); +} + /* * vfsmount lock must be held for write * namespace_sem must be held for write */ -void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) +void umount_tree(struct mount *mnt, int propagate) { LIST_HEAD(tmp_list); struct mount *p; @@ -1142,20 +1188,20 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { p->mnt_parent->mnt_ghosts++; - dentry_reset_mounted(p->mnt_mountpoint); + put_mountpoint(p->mnt_mp); + p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); } - list_splice(&tmp_list, kill); + list_splice(&tmp_list, &unmounted); } -static void shrink_submounts(struct mount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt); static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; - LIST_HEAD(umount_list); retval = security_sb_umount(&mnt->mnt, flags); if (retval) @@ -1222,22 +1268,21 @@ static int do_umount(struct mount *mnt, int flags) return retval; } - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); event++; if (!(flags & MNT_DETACH)) - shrink_submounts(mnt, &umount_list); + shrink_submounts(mnt); retval = -EBUSY; if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1, &umount_list); + umount_tree(mnt, 1); retval = 0; } br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); return retval; } @@ -1310,13 +1355,13 @@ static bool mnt_ns_loop(struct path *path) * mount namespace loop? */ struct inode *inode = path->dentry->d_inode; - struct proc_inode *ei; + struct proc_ns *ei; struct mnt_namespace *mnt_ns; if (!proc_ns_inode(inode)) return false; - ei = PROC_I(inode); + ei = get_proc_ns(inode); if (ei->ns_ops != &mntns_operations) return false; @@ -1327,8 +1372,7 @@ static bool mnt_ns_loop(struct path *path) struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct mount *res, *p, *q, *r; - struct path path; + struct mount *res, *p, *q, *r, *parent; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) return ERR_PTR(-EINVAL); @@ -1355,25 +1399,22 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = &q->mnt; - path.dentry = p->mnt_mountpoint; + parent = q; q = clone_mnt(p, p->mnt.mnt_root, flag); if (IS_ERR(q)) goto out; br_write_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, &path); + attach_mnt(q, parent, p->mnt_mp); br_write_unlock(&vfsmount_lock); } } return res; out: if (res) { - LIST_HEAD(umount_list); br_write_lock(&vfsmount_lock); - umount_tree(res, 0, &umount_list); + umount_tree(res, 0); br_write_unlock(&vfsmount_lock); - release_mounts(&umount_list); } return q; } @@ -1383,10 +1424,10 @@ out: struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; - down_write(&namespace_sem); + namespace_lock(); tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); - up_write(&namespace_sem); + namespace_unlock(); if (IS_ERR(tree)) return NULL; return &tree->mnt; @@ -1394,13 +1435,11 @@ struct vfsmount *collect_mounts(struct path *path) void drop_collected_mounts(struct vfsmount *mnt) { - LIST_HEAD(umount_list); - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); - umount_tree(real_mount(mnt), 0, &umount_list); + umount_tree(real_mount(mnt), 0); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); } int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, @@ -1509,11 +1548,11 @@ static int invent_group_ids(struct mount *mnt, bool recurse) * in allocations. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct path *path, struct path *parent_path) + struct mount *dest_mnt, + struct mountpoint *dest_mp, + struct path *parent_path) { LIST_HEAD(tree_list); - struct mount *dest_mnt = real_mount(path->mnt); - struct dentry *dest_dentry = path->dentry; struct mount *child, *p; int err; @@ -1522,7 +1561,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; } - err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); if (err) goto out_cleanup_ids; @@ -1534,10 +1573,10 @@ static int attach_recursive_mnt(struct mount *source_mnt, } if (parent_path) { detach_mnt(source_mnt, parent_path); - attach_mnt(source_mnt, path); + attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { - mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -1556,46 +1595,53 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static int lock_mount(struct path *path) +static struct mountpoint *lock_mount(struct path *path) { struct vfsmount *mnt; + struct dentry *dentry = path->dentry; retry: - mutex_lock(&path->dentry->d_inode->i_mutex); - if (unlikely(cant_mount(path->dentry))) { - mutex_unlock(&path->dentry->d_inode->i_mutex); - return -ENOENT; + mutex_lock(&dentry->d_inode->i_mutex); + if (unlikely(cant_mount(dentry))) { + mutex_unlock(&dentry->d_inode->i_mutex); + return ERR_PTR(-ENOENT); } - down_write(&namespace_sem); + namespace_lock(); mnt = lookup_mnt(path); - if (likely(!mnt)) - return 0; - up_write(&namespace_sem); + if (likely(!mnt)) { + struct mountpoint *mp = new_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); + return mp; + } + return mp; + } + namespace_unlock(); mutex_unlock(&path->dentry->d_inode->i_mutex); path_put(path); path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); + dentry = path->dentry = dget(mnt->mnt_root); goto retry; } -static void unlock_mount(struct path *path) +static void unlock_mount(struct mountpoint *where) { - up_write(&namespace_sem); - mutex_unlock(&path->dentry->d_inode->i_mutex); + struct dentry *dentry = where->m_dentry; + put_mountpoint(where); + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); } -static int graft_tree(struct mount *mnt, struct path *path) +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(path->dentry->d_inode->i_mode) != + if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; - if (d_unlinked(path->dentry)) - return -ENOENT; - - return attach_recursive_mnt(mnt, path, NULL); + return attach_recursive_mnt(mnt, p, mp, NULL); } /* @@ -1633,7 +1679,7 @@ static int do_change_type(struct path *path, int flag) if (!type) return -EINVAL; - down_write(&namespace_sem); + namespace_lock(); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -1646,7 +1692,7 @@ static int do_change_type(struct path *path, int flag) br_write_unlock(&vfsmount_lock); out_unlock: - up_write(&namespace_sem); + namespace_unlock(); return err; } @@ -1656,9 +1702,9 @@ static int do_change_type(struct path *path, int flag) static int do_loopback(struct path *path, const char *old_name, int recurse) { - LIST_HEAD(umount_list); struct path old_path; - struct mount *mnt = NULL, *old; + struct mount *mnt = NULL, *old, *parent; + struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; @@ -1670,17 +1716,19 @@ static int do_loopback(struct path *path, const char *old_name, if (mnt_ns_loop(&old_path)) goto out; - err = lock_mount(path); - if (err) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; old = real_mount(old_path.mnt); + parent = real_mount(path->mnt); err = -EINVAL; if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) + if (!check_mnt(parent) || !check_mnt(old)) goto out2; if (recurse) @@ -1693,15 +1741,14 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } - err = graft_tree(mnt, path); + err = graft_tree(mnt, parent, mp); if (err) { br_write_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); + umount_tree(mnt, 0); br_write_unlock(&vfsmount_lock); } out2: - unlock_mount(path); - release_mounts(&umount_list); + unlock_mount(mp); out: path_put(&old_path); return err; @@ -1786,6 +1833,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct path old_path, parent_path; struct mount *p; struct mount *old; + struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; @@ -1793,8 +1841,9 @@ static int do_move_mount(struct path *path, const char *old_name) if (err) return err; - err = lock_mount(path); - if (err < 0) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; old = real_mount(old_path.mnt); @@ -1804,9 +1853,6 @@ static int do_move_mount(struct path *path, const char *old_name) if (!check_mnt(p) || !check_mnt(old)) goto out1; - if (d_unlinked(path->dentry)) - goto out1; - err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; @@ -1833,7 +1879,7 @@ static int do_move_mount(struct path *path, const char *old_name) if (p == old) goto out1; - err = attach_recursive_mnt(old, path, &parent_path); + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); if (err) goto out1; @@ -1841,7 +1887,7 @@ static int do_move_mount(struct path *path, const char *old_name) * automatically */ list_del_init(&old->mnt_expire); out1: - unlock_mount(path); + unlock_mount(mp); out: if (!err) path_put(&parent_path); @@ -1877,21 +1923,24 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) */ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { + struct mountpoint *mp; + struct mount *parent; int err; mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); - err = lock_mount(path); - if (err) - return err; + mp = lock_mount(path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + parent = real_mount(path->mnt); err = -EINVAL; - if (unlikely(!check_mnt(real_mount(path->mnt)))) { + if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) goto unlock; /* ... and for those we'd better have mountpoint still alive */ - if (!real_mount(path->mnt)->mnt_ns) + if (!parent->mnt_ns) goto unlock; } @@ -1906,10 +1955,10 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) goto unlock; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, path); + err = graft_tree(newmnt, parent, mp); unlock: - unlock_mount(path); + unlock_mount(mp); return err; } @@ -1982,11 +2031,11 @@ int finish_automount(struct vfsmount *m, struct path *path) fail: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); list_del_init(&mnt->mnt_expire); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); + namespace_unlock(); } mntput(m); mntput(m); @@ -2000,13 +2049,13 @@ fail: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); + namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); @@ -2019,12 +2068,11 @@ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); - LIST_HEAD(umounts); if (list_empty(mounts)) return; - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); /* extract from the expiration list every vfsmount that matches the @@ -2042,12 +2090,10 @@ void mark_mounts_for_expiry(struct list_head *mounts) while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, &umounts); + umount_tree(mnt, 1); } br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - - release_mounts(&umounts); + namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -2104,7 +2150,7 @@ resume: * * vfsmount_lock must be held for write */ -static void shrink_submounts(struct mount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; @@ -2115,7 +2161,7 @@ static void shrink_submounts(struct mount *mnt, struct list_head *umounts) m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1, umounts); + umount_tree(m, 1); } } } @@ -2238,12 +2284,11 @@ long do_mount(const char *dev_name, const char *dir_name, retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); + if (!retval && !may_mount()) + retval = -EPERM; if (retval) goto dput_out; - if (!may_mount()) - return -EPERM; - /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; @@ -2342,14 +2387,14 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (IS_ERR(new_ns)) return new_ns; - down_write(&namespace_sem); + namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { - up_write(&namespace_sem); + namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } @@ -2380,7 +2425,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, p = next_mnt(p, old); q = next_mnt(q, new); } - up_write(&namespace_sem); + namespace_unlock(); if (rootmnt) mntput(rootmnt); @@ -2418,7 +2463,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; - list_add(&new_ns->list, &mnt->mnt_list); + list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); } @@ -2550,7 +2595,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new, old, parent_path, root_parent, root; - struct mount *new_mnt, *root_mnt; + struct mount *new_mnt, *root_mnt, *old_mnt; + struct mountpoint *old_mp, *root_mp; int error; if (!may_mount()) @@ -2569,14 +2615,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - error = lock_mount(&old); - if (error) + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) goto out3; error = -EINVAL; new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); - if (IS_MNT_SHARED(real_mount(old.mnt)) || + old_mnt = real_mount(old.mnt); + if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(new_mnt->mnt_parent) || IS_MNT_SHARED(root_mnt->mnt_parent)) goto out4; @@ -2585,37 +2633,37 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; - if (d_unlinked(old.dentry)) - goto out4; error = -EBUSY; - if (new.mnt == root.mnt || - old.mnt == root.mnt) + if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ + root_mp = root_mnt->mnt_mp; if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) + if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; + root_mp->m_count++; /* pin it so it won't go away */ br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root_mnt, &old); + attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ - attach_mnt(new_mnt, &root_parent); + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(&vfsmount_lock); chroot_fs_refs(&root, &new); + put_mountpoint(root_mp); error = 0; out4: - unlock_mount(&old); + unlock_mount(old_mp); if (!error) { path_put(&root_parent); path_put(&parent_path); @@ -2670,14 +2718,17 @@ void __init mnt_init(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - if (!mount_hashtable) + if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mount_hashtable[u]); + for (u = 0; u < HASH_SIZE; u++) + INIT_LIST_HEAD(&mountpoint_hashtable[u]); br_lock_init(&vfsmount_lock); @@ -2694,16 +2745,13 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - LIST_HEAD(umount_list); - if (!atomic_dec_and_test(&ns->count)) return; - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); - umount_tree(ns->root, 0, &umount_list); + umount_tree(ns->root, 0); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); free_mnt_ns(ns); } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 816326093656..3be047474bfc 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -23,12 +23,12 @@ #include "ncp_fs.h" -static void ncp_read_volume_list(struct file *, void *, filldir_t, +static void ncp_read_volume_list(struct file *, struct dir_context *, struct ncp_cache_control *); -static void ncp_do_readdir(struct file *, void *, filldir_t, +static void ncp_do_readdir(struct file *, struct dir_context *, struct ncp_cache_control *); -static int ncp_readdir(struct file *, void *, filldir_t); +static int ncp_readdir(struct file *, struct dir_context *); static int ncp_create(struct inode *, struct dentry *, umode_t, bool); static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); @@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ncp_readdir, + .iterate = ncp_readdir, .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, @@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations = * Dentry operations routines */ static int ncp_lookup_validate(struct dentry *, unsigned int); -static int ncp_hash_dentry(const struct dentry *, const struct inode *, - struct qstr *); -static int ncp_compare_dentry(const struct dentry *, const struct inode *, - const struct dentry *, const struct inode *, +static int ncp_hash_dentry(const struct dentry *, struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); @@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i) /* * Note: leave the hash unchanged if the directory * is case-sensitive. + * + * Accessing the parent inode can be racy under RCU pathwalking. + * Use ACCESS_ONCE() to make sure we use _one_ particular inode, + * the callers will handle races. */ static int -ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *this) +ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) { + struct inode *inode = ACCESS_ONCE(dentry->d_inode); + + if (!inode) + return 0; + if (!ncp_case_sensitive(inode)) { struct super_block *sb = dentry->d_sb; struct nls_table *t; @@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, return 0; } +/* + * Accessing the parent inode can be racy under RCU pathwalking. + * Use ACCESS_ONCE() to make sure we use _one_ particular inode, + * the callers will handle races. + */ static int -ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { + struct inode *pinode; + if (len != name->len) return 1; + pinode = ACCESS_ONCE(parent->d_inode); + if (!pinode) + return 1; + if (ncp_case_sensitive(pinode)) return strncmp(str, name->name, len); @@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) return ncp_date_dos2unix(i.modifyTime, i.modifyDate); } -static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ncp_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct page *page = NULL; struct ncp_server *server = NCP_SERVER(inode); @@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (int) filp->f_pos); + (int) ctx->pos); result = -EIO; /* Do not generate '.' and '..' when server is dead. */ @@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; result = 0; - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) - goto out; - filp->f_pos = 1; - } - if (filp->f_pos == 1) { - if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR)) - goto out; - filp->f_pos = 2; - } + if (!dir_emit_dots(file, ctx)) + goto out; page = grab_cache_page(&inode->i_data, 0); if (!page) @@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) if (!PageUptodate(page) || !ctl.head.eof) goto init_cache; - if (filp->f_pos == 2) { + if (ctx->pos == 2) { if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) goto init_cache; @@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) goto init_cache; } - if (filp->f_pos > ctl.head.end) + if (ctx->pos > ctl.head.end) goto finished; - ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); + ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2); ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; @@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (ctl.idx < NCP_DIRCACHE_SIZE) { struct dentry *dent; - int res; + bool over; dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], - dentry, filp->f_pos); + dentry, ctx->pos); if (!dent) goto invalid_cache; - res = filldir(dirent, dent->d_name.name, - dent->d_name.len, filp->f_pos, + over = !dir_emit(ctx, dent->d_name.name, + dent->d_name.len, dent->d_inode->i_ino, DT_UNKNOWN); dput(dent); - if (res) + if (over) goto finished; - filp->f_pos += 1; + ctx->pos += 1; ctl.idx += 1; - if (filp->f_pos > ctl.head.end) + if (ctx->pos > ctl.head.end) goto finished; } if (ctl.page) { @@ -548,9 +556,9 @@ init_cache: ctl.valid = 1; read_really: if (ncp_is_server_root(inode)) { - ncp_read_volume_list(filp, dirent, filldir, &ctl); + ncp_read_volume_list(file, ctx, &ctl); } else { - ncp_do_readdir(filp, dirent, filldir, &ctl); + ncp_do_readdir(file, ctx, &ctl); } ctl.head.end = ctl.fpos - 1; ctl.head.eof = ctl.valid; @@ -573,11 +581,11 @@ out: } static int -ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, +ncp_fill_cache(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, int inval_childs) { - struct dentry *newdent, *dentry = filp->f_path.dentry; + struct dentry *newdent, *dentry = file->f_path.dentry; struct inode *dir = dentry->d_inode; struct ncp_cache_control ctl = *ctrl; struct qstr qname; @@ -666,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, end_advance: if (!valid) ctl.valid = 0; - if (!ctl.filled && (ctl.fpos == filp->f_pos)) { - if (!ino) - ino = find_inode_number(dentry, &qname); + if (!ctl.filled && (ctl.fpos == ctx->pos)) { if (!ino) ino = iunique(dir->i_sb, 2); - ctl.filled = filldir(dirent, qname.name, qname.len, - filp->f_pos, ino, DT_UNKNOWN); + ctl.filled = !dir_emit(ctx, qname.name, qname.len, + ino, DT_UNKNOWN); if (!ctl.filled) - filp->f_pos += 1; + ctx->pos += 1; } ctl.fpos += 1; ctl.idx += 1; @@ -683,10 +689,10 @@ end_advance: } static void -ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, +ncp_read_volume_list(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctl) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct ncp_server *server = NCP_SERVER(inode); struct ncp_volume_info info; @@ -694,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, int i; DPRINTK("ncp_read_volume_list: pos=%ld\n", - (unsigned long) filp->f_pos); + (unsigned long) ctx->pos); for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { int inval_dentry; @@ -715,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, } inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) + if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry)) return; } } static void -ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, +ncp_do_readdir(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctl) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *dir = dentry->d_inode; struct ncp_server *server = NCP_SERVER(dir); struct nw_search_sequence seq; @@ -736,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) filp->f_pos); + (unsigned long) ctx->pos); PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", dentry->d_name.name, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); @@ -778,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, rpl += onerpl; rpls -= onerpl; entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) + if (!ncp_fill_cache(file, ctx, ctl, &entry, 0)) break; } } while (more); @@ -1029,15 +1035,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); - /* - * fail with EBUSY if there are still references to this - * directory. - */ - dentry_unhash(dentry); - error = -EBUSY; - if (!d_unhashed(dentry)) - goto out; - len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); @@ -1140,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) { - /* - * fail with EBUSY if there are still references to this - * directory. - */ - dentry_unhash(new_dentry); - error = -EBUSY; - if (!d_unhashed(new_dentry)) - goto out; - } - ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 26910c8154da..4659da67e7f6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) switch (optval) { case 'u': data->uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->uid)) + if (!uid_valid(data->uid)) { + ret = -EINVAL; goto err; + } break; case 'g': data->gid = make_kgid(current_user_ns(), optint); - if (!gid_valid(data->gid)) + if (!gid_valid(data->gid)) { + ret = -EINVAL; goto err; + } break; case 'o': data->mounted_uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->mounted_uid)) + if (!uid_valid(data->mounted_uid)) { + ret = -EINVAL; goto err; + } break; case 'm': data->file_mode = optint; @@ -891,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) if (!server) /* How this could happen? */ goto out; + result = -EPERM; + if (IS_DEADDIR(dentry->d_inode)) + goto out; + /* ageing the dentry to force validation */ ncp_age_dentry(server, dentry); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index ee24df5af1f9..3c5dd55d284c 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; /* we do not support files bigger than 4GB... We eventually supports just 4GB... */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + if (vma_pages(vma) + vma->vm_pgoff > (1U << (32 - PAGE_SHIFT))) return -EFBIG; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 13ca196385f5..b5e80b0af315 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -104,6 +104,15 @@ config NFS_V4_1 If unsure, say N. +config NFS_V4_2 + bool "NFS client support for NFSv4.2" + depends on NFS_V4_1 + help + This option enables support for minor version 2 of the NFSv4 protocol + in the kernel's NFS client. + + If unsure, say N. + config PNFS_FILE_LAYOUT tristate depends on NFS_V4_1 @@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN If the NFS client is unchanged from the upstream kernel, this option should be set to the default "kernel.org". +config NFS_V4_SECURITY_LABEL + bool + depends on NFS_V4_2 && SECURITY + default y + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index cce2c057bd2d..e0bb048e9576 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ direct.o pagelist.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o \ - dns_resolve.o cache_lib.o + write.o namespace.o mount_clnt.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o @@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ - nfs4namespace.o nfs4getroot.o nfs4client.o + nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o +nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 434b93ec0970..e242bbf72972 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dev->pgbase = 0; dev->pglen = PAGE_SIZE * max_pages; dev->mincount = 0; + dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); - rc = nfs4_proc_getdeviceinfo(server, dev); + rc = nfs4_proc_getdeviceinfo(server, dev, NULL); dprintk("%s getdevice info returns %d\n", __func__, rc); if (rc) { rv = ERR_PTR(rc); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index f4891bde8851..8485978993e8 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -173,7 +173,7 @@ struct bl_msg_hdr { /* blocklayoutdev.c */ ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -int nfs4_blkdev_put(struct block_device *bdev); +void nfs4_blkdev_put(struct block_device *bdev); struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev); int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a86c5bdad9e3..04303b5c9361 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -56,11 +56,11 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) /* * Release the block device */ -int nfs4_blkdev_put(struct block_device *bdev) +void nfs4_blkdev_put(struct block_device *bdev) { dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - return blkdev_put(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ); } ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 6fc7b5cae92b..8999cfddd866 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -88,14 +88,8 @@ out: */ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) { - int rv; - dprintk("%s Releasing\n", __func__); - rv = nfs4_blkdev_put(bdev->bm_mdev); - if (rv) - printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n", - __func__, rv); - + nfs4_blkdev_put(bdev->bm_mdev); dev_remove(bdev->net, bdev->bm_mdev->bd_dev); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 5088b57b078a..67cd73213168 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,6 +125,9 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { @@ -208,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_rqst *rqstp; int (*callback_svc)(void *vrqstp); struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; - char svc_name[12]; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); @@ -232,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, svc_sock_update_bufs(serv); - sprintf(svc_name, "nfsv4.%u-svc", minorversion); cb_info->serv = serv; cb_info->rqst = rqstp; - cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + cb_info->task = kthread_run(callback_svc, cb_info->rqst, + "nfsv4.%u-svc", minorversion); if (IS_ERR(cb_info->task)) { ret = PTR_ERR(cb_info->task); svc_exit_thread(cb_info->rqst); @@ -279,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n ret = nfs4_callback_up_net(serv, net); break; case 1: + case 2: ret = nfs41_callback_up_net(serv, net); break; default: diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index efd54f0a4c46..84326e9fb47a 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -32,6 +32,8 @@ enum nfs4_callback_opnum { OP_CB_WANTS_CANCELLED = 12, OP_CB_NOTIFY_LOCK = 13, OP_CB_NOTIFY_DEVICEID = 14, +/* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, OP_CB_ILLEGAL = 10044, }; @@ -39,6 +41,7 @@ struct cb_process_state { __be32 drc_status; struct nfs_client *clp; u32 slotid; + u32 minorversion; struct net *net; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2960512792c2..e6ebc4c38c81 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, int i; __be32 status = htonl(NFS4ERR_BADSESSION); - clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); + clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, + &args->csa_sessionid, cps->minorversion); if (clp == NULL) goto out; @@ -414,7 +415,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ - if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session @@ -500,7 +501,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, &args->craa_type_mask)) pnfs_recall_all_layouts(cps->clp); if (flags) - nfs_expire_all_delegation_types(cps->clp, flags); + nfs_expire_unused_delegation_types(cps->clp, flags); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 59461c957d9d..f4ccfe6521ec 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); hdr->minorversion = ntohl(*p++); - /* Check minor version is zero or one. */ - if (hdr->minorversion <= 1) { - hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ + /* Check for minor version support */ + if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */ } else { pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " "illegal minor version %u!\n", @@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * A single slot, so highest used slotid is either 0 or -1 */ tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(session, tbl); + nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } @@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps) } #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + __be32 status = preprocess_nfs41_op(nop, op_nr, op); + if (status != htonl(NFS4ERR_OP_ILLEGAL)) + return status; + + if (op_nr == OP_CB_OFFLOAD) + return htonl(NFS4ERR_NOTSUPP); + return htonl(NFS4ERR_OP_ILLEGAL); +} +#else /* CONFIG_NFS_V4_2 */ +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} +#endif /* CONFIG_NFS_V4_2 */ + static __be32 preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) { @@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) return htonl(NFS_OK); } -static __be32 process_op(uint32_t minorversion, int nop, - struct svc_rqst *rqstp, +static __be32 process_op(int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, struct xdr_stream *xdr_out, void *resp, struct cb_process_state *cps) @@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop, return status; dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", - __func__, minorversion, nop, op_nr); + __func__, cps->minorversion, nop, op_nr); + + switch (cps->minorversion) { + case 0: + status = preprocess_nfs4_op(op_nr, &op); + break; + case 1: + status = preprocess_nfs41_op(nop, op_nr, &op); + break; + case 2: + status = preprocess_nfs42_op(nop, op_nr, &op); + break; + default: + status = htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } - status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) : - preprocess_nfs4_op(op_nr, &op); if (status == htonl(NFS4ERR_OP_ILLEGAL)) op_nr = OP_CB_ILLEGAL; if (status) @@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_drop_reply; } + cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) return rpc_system_err; while (status == 0 && nops != hdr_arg.nops) { - status = process_op(hdr_arg.minorversion, nops, rqstp, - &xdr_in, argp, &xdr_out, resp, &cps); + status = process_op(nops, rqstp, &xdr_in, + argp, &xdr_out, resp, &cps); nops++; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 84d8eae203a7..340b1eff0267 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -593,6 +593,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -751,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); - if (server->options & NFS_OPTION_MIGRATION) - set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); @@ -1074,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, } if (!(fattr->valid & NFS_ATTR_FATTR)) { - error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); + error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6390a4b5fee7..7ec4814e298d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -64,31 +64,29 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) return ret; } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; int status = 0; if (inode->i_flock == NULL) - return 0; - - if (inode->i_flock == NULL) goto out; - /* Protect inode->i_flock using the file locks lock */ - lock_flocks(); + + /* Protect inode->i_flock using the i_lock */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file) != ctx) continue; - unlock_flocks(); - status = nfs4_lock_delegation_recall(state, fl); + spin_unlock(&inode->i_lock); + status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; - lock_flocks(); + spin_lock(&inode->i_lock); } - unlock_flocks(); + spin_unlock(&inode->i_lock); out: return status; } @@ -120,7 +118,7 @@ again: seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) - err = nfs_delegation_claim_locks(ctx, state); + err = nfs_delegation_claim_locks(ctx, state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); @@ -389,6 +387,24 @@ out: return err; } +static bool nfs_delegation_need_return(struct nfs_delegation *delegation) +{ + bool ret = false; + + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) + ret = true; + if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { + struct inode *inode; + + spin_lock(&delegation->lock); + inode = delegation->inode; + if (inode && list_empty(&NFS_I(inode)->open_files)) + ret = true; + spin_unlock(&delegation->lock); + } + return ret; +} + /** * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process @@ -411,8 +427,7 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (!test_and_clear_bit(NFS_DELEGATION_RETURN, - &delegation->flags)) + if (!nfs_delegation_need_return(delegation)) continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) @@ -471,6 +486,13 @@ int nfs4_inode_return_delegation(struct inode *inode) return err; } +static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + static void nfs_mark_return_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { @@ -478,6 +500,45 @@ static void nfs_mark_return_delegation(struct nfs_server *server, set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } +static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + bool ret = false; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + nfs_mark_return_delegation(server, delegation); + ret = true; + } + return ret; +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_server_mark_return_all_delegations(server); + rcu_read_unlock(); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegations(clp); + nfs_delegation_run_state_manager(clp); +} + /** * nfs_super_return_all_delegations - return delegations for one superblock * @sb: sb to process @@ -486,24 +547,22 @@ static void nfs_mark_return_delegation(struct nfs_server *server, void nfs_server_return_all_delegations(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; + bool need_wait; if (clp == NULL) return; rcu_read_lock(); - list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - spin_lock(&delegation->lock); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - spin_unlock(&delegation->lock); - } + need_wait = nfs_server_mark_return_all_delegations(server); rcu_read_unlock(); - if (nfs_client_return_marked_delegations(clp) != 0) + if (need_wait) { nfs4_schedule_state_manager(clp); + nfs4_wait_clnt_recover(clp); + } } -static void nfs_mark_return_all_delegation_types(struct nfs_server *server, +static void nfs_mark_return_unused_delegation_types(struct nfs_server *server, fmode_t flags) { struct nfs_delegation *delegation; @@ -512,27 +571,21 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server, if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(server, delegation); + nfs_mark_return_if_closed_delegation(server, delegation); } } -static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, +static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp, fmode_t flags) { struct nfs_server *server; rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - nfs_mark_return_all_delegation_types(server, flags); + nfs_mark_return_unused_delegation_types(server, flags); rcu_read_unlock(); } -static void nfs_delegation_run_state_manager(struct nfs_client *clp) -{ - if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) - nfs4_schedule_state_manager(clp); -} - void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; @@ -546,27 +599,17 @@ void nfs_remove_bad_delegation(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); /** - * nfs_expire_all_delegation_types + * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire * */ -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags) { - nfs_client_mark_return_all_delegation_types(clp, flags); + nfs_client_mark_return_unused_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } -/** - * nfs_expire_all_delegations - * @clp: client to process - * - */ -void nfs_expire_all_delegations(struct nfs_client *clp) -{ - nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); -} - static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; @@ -574,7 +617,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(server, delegation); + nfs_mark_return_if_closed_delegation(server, delegation); } } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index d54d4fca6793..9a79c7a99d6d 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -28,6 +28,7 @@ struct nfs_delegation { enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, + NFS_DELEGATION_RETURN_IF_CLOSED, NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, }; @@ -41,7 +42,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_server_return_all_delegations(struct nfs_server *); void nfs_expire_all_delegations(struct nfs_client *clp); -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); @@ -53,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f23f455be42b..0fac2cb1ea18 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,6 +33,7 @@ #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/swap.h> #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> @@ -46,7 +47,7 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); -static int nfs_readdir(struct file *, void *, filldir_t); +static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct page*); @@ -54,7 +55,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .readdir = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -147,6 +148,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; + struct dir_context *ctx; unsigned long page_index; u64 *dir_cookie; u64 last_cookie; @@ -252,7 +254,7 @@ out: static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { - loff_t diff = desc->file->f_pos - desc->current_index; + loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; if (diff < 0) @@ -289,7 +291,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->file->f_pos) { + } else if (new_pos < desc->ctx->pos) { if (ctx->duped > 0 && ctx->dup_cookie == *desc->dir_cookie) { if (printk_ratelimit()) { @@ -307,7 +309,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des ctx->dup_cookie = *desc->dir_cookie; ctx->duped = -1; } - desc->file->f_pos = new_pos; + desc->ctx->pos = new_pos; desc->cache_entry_index = i; return 0; } @@ -405,13 +407,13 @@ different: } static -bool nfs_use_readdirplus(struct inode *dir, struct file *filp) +bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) return true; - if (filp->f_pos == 0) + if (ctx->pos == 0) return true; return false; } @@ -435,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct dentry *alias; struct inode *dir = parent->d_inode; struct inode *inode; + int status; if (filename.name[0] == '.') { if (filename.len == 1) @@ -447,7 +450,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) dentry = d_lookup(parent, &filename); if (dentry != NULL) { if (nfs_same_file(dentry, entry)) { - nfs_refresh_inode(dentry->d_inode, entry->fattr); + status = nfs_refresh_inode(dentry->d_inode, entry->fattr); + if (!status) + nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); goto out; } else { if (d_invalidate(dentry) != 0) @@ -460,7 +465,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (dentry == NULL) return; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); if (IS_ERR(inode)) goto out; @@ -585,10 +590,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (entry.fh == NULL || entry.fattr == NULL) goto out; + entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry.label)) { + status = PTR_ERR(entry.label); + goto out; + } + array = nfs_readdir_get_array(page); if (IS_ERR(array)) { status = PTR_ERR(array); - goto out; + goto out_label_free; } memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; @@ -614,6 +625,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: nfs_readdir_release_array(page); +out_label_free: + nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); nfs_free_fhandle(entry.fh); @@ -702,8 +715,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) * Once we've found the start of the dirent within a page: fill 'er up... */ static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, - filldir_t filldir) +int nfs_do_filldir(nfs_readdir_descriptor_t *desc) { struct file *file = desc->file; int i = 0; @@ -721,13 +733,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, struct nfs_cache_array_entry *ent; ent = &array->array[i]; - if (filldir(dirent, ent->string.name, ent->string.len, - file->f_pos, nfs_compat_user_ino64(ent->ino), - ent->d_type) < 0) { + if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eof = 1; break; } - file->f_pos++; + desc->ctx->pos++; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else @@ -759,8 +770,7 @@ out: * directory in the page cache by the time we get here. */ static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, - filldir_t filldir) +int uncached_readdir(nfs_readdir_descriptor_t *desc) { struct page *page = NULL; int status; @@ -785,7 +795,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, if (status < 0) goto out_release; - status = nfs_do_filldir(desc, dirent, filldir); + status = nfs_do_filldir(desc); out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", @@ -800,35 +810,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, last cookie cache takes care of the common case of reading the whole directory. */ -static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int nfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - struct nfs_open_dir_context *dir_ctx = filp->private_data; + struct nfs_open_dir_context *dir_ctx = file->private_data; int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (long long)filp->f_pos); + (long long)ctx->pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); /* - * filp->f_pos points to the dirent entry number. + * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. */ memset(desc, 0, sizeof(*desc)); - desc->file = filp; + desc->file = file; + desc->ctx = ctx; desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; + desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -840,7 +851,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* This means either end of directory */ if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ - res = uncached_readdir(desc, dirent, filldir); + res = uncached_readdir(desc); if (res == 0) continue; } @@ -857,7 +868,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res < 0) break; - res = nfs_do_filldir(desc, dirent, filldir); + res = nfs_do_filldir(desc); if (res < 0) break; } while (!desc->eof); @@ -1040,6 +1051,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; int error; if (flags & LOOKUP_RCU) @@ -1082,7 +1094,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (fhandle == NULL || fattr == NULL) goto out_error; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(label)) + goto out_error; + + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1090,8 +1106,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if ((error = nfs_refresh_inode(inode, fattr)) != 0) goto out_bad; + nfs_setsecurity(inode, fattr, label); + nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); + nfs4_label_free(label); + out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: @@ -1108,6 +1128,7 @@ out_zap_parent: out_bad: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); + nfs4_label_free(label); nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ @@ -1128,6 +1149,7 @@ out_zap_parent: out_error: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); + nfs4_label_free(label); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", __func__, dentry->d_parent->d_name.name, @@ -1256,6 +1278,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; int error; dfprintk(VFS, "NFS: lookup(%s/%s)\n", @@ -1282,17 +1305,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (fhandle == NULL || fattr == NULL) goto out; + label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT); + if (IS_ERR(label)) + goto out; + parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); goto out_unblock_sillyrename; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); res = ERR_CAST(inode); if (IS_ERR(res)) goto out_unblock_sillyrename; @@ -1310,6 +1337,7 @@ no_entry: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); + nfs4_label_free(label); out: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); @@ -1357,18 +1385,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; - if (ctx->dentry != dentry) { - dput(ctx->dentry); - ctx->dentry = dget(dentry); - } - - /* If the open_intent is for execute, we have an extra check to make */ - if (ctx->mode & FMODE_EXEC) { - err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags); - if (err < 0) - goto out; - } - err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -1427,13 +1443,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, nfs_block_sillyrename(dentry->d_parent); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); - d_drop(dentry); + nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { - nfs_unblock_sillyrename(dentry->d_parent); put_nfs_open_context(ctx); err = PTR_ERR(inode); switch (err) { case -ENOENT: + d_drop(dentry); d_add(dentry, NULL); break; case -EISDIR: @@ -1449,16 +1465,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } goto out; } - res = d_add_unique(dentry, inode); - if (res != NULL) - dentry = res; - - nfs_unblock_sillyrename(dentry->d_parent); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - - err = nfs_finish_open(ctx, dentry, file, open_flags, opened); - dput(res); + err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); out: return err; @@ -1486,6 +1494,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto no_open; if (d_mountpoint(dentry)) goto no_open; + if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) + goto no_open; inode = dentry->d_inode; parent = dget_parent(dentry); @@ -1526,7 +1536,8 @@ no_open: * Code common to create, mkdir, and mknod. */ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; @@ -1539,18 +1550,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) goto out_error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); - error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); if (error < 0) goto out_error; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); error = PTR_ERR(inode); if (IS_ERR(inode)) goto out_error; @@ -1719,7 +1730,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) dir->i_ino, dentry->d_name.name); spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { + if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); @@ -1757,7 +1768,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink); */ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct pagevec lru_pvec; struct page *page; char *kaddr; struct iattr attr; @@ -1797,11 +1807,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - pagevec_init(&lru_pvec, 0); - if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { - pagevec_add(&lru_pvec, page); - pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else @@ -1868,7 +1875,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, - new_dentry->d_count); + d_count(new_dentry)); /* * For non-directories, check whether the target is busy and if so, @@ -1886,7 +1893,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, rehash = new_dentry; } - if (new_dentry->d_count > 2) { + if (d_count(new_dentry) > 2) { int err; /* copy the target dentry's name */ diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 945527092295..fc0f95ec7358 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, kfree(ip_addr); return ret; } -EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); #else @@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = -ESRCH; return ret; } -EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); static struct cache_detail nfs_dns_resolve_template = { .owner = THIS_MODULE, @@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net) cache_destroy_net(nn->nfs_dns_resolve, net); } +static int nfs4_dns_net_init(struct net *net) +{ + return nfs_dns_resolver_cache_init(net); +} + +static void nfs4_dns_net_exit(struct net *net) +{ + nfs_dns_resolver_cache_destroy(net); +} + +static struct pernet_operations nfs4_dns_resolver_ops = { + .init = nfs4_dns_net_init, + .exit = nfs4_dns_net_exit, +}; + static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = { int nfs_dns_resolver_init(void) { - return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); + int err; + + err = register_pernet_subsys(&nfs4_dns_resolver_ops); + if (err < 0) + goto out; + err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block); + if (err < 0) + goto out1; + return 0; +out1: + unregister_pernet_subsys(&nfs4_dns_resolver_ops); +out: + return err; } void nfs_dns_resolver_destroy(void) { rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); + unregister_pernet_subsys(&nfs4_dns_resolver_ops); } #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29f4a48a0ee6..94e94bd11aae 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * - Called if either PG_private or PG_fscache is set on the page * - Caller holds page lock */ -static void nfs_invalidate_page(struct page *page, unsigned long offset) +static void nfs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { - dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", + page, offset, length); - if (offset != 0) + if (offset != 0 || length < PAGE_CACHE_SIZE) return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page_file_mapping(page)->host, page); @@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp) return nfs_fscache_release_page(page, gfp); } +static void nfs_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct nfs_inode *nfsi; + struct address_space *mapping = page_file_mapping(page); + + if (!mapping || PageSwapCache(page)) + return; + + /* + * Check if an unstable page is currently being committed and + * if so, have the VM treat it as if the page is under writeback + * so it will not block due to pages that will shortly be freeable. + */ + nfsi = NFS_I(mapping->host); + if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + *writeback = true; + return; + } + + /* + * If PagePrivate() is set, then the page is not freeable and as the + * inode is not being committed, it's not going to be cleaned in the + * near future so treat it as dirty + */ + if (PagePrivate(page)) + *dirty = true; +} + /* * Attempt to clear the private state associated with a page when an error * occurs that requires the cached contents of an inode to be written back or @@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = { .direct_IO = nfs_direct_IO, .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, + .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, #ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, @@ -744,6 +776,7 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; + struct nfs_lock_context *l_ctx; int status; /* @@ -752,6 +785,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); + l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); + if (!IS_ERR(l_ctx)) { + status = nfs_iocounter_wait(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + if (status < 0) + return status; + } + /* NOTE: special case * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 44efaa8c5f78..66984a9aafaa 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, goto out; } - inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); ret = ERR_CAST(inode); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index c516da5873fd..c2c4163d5683 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, return desclen; } -static ssize_t nfs_idmap_request_key(struct key_type *key_type, - const char *name, size_t namelen, - const char *type, void *data, - size_t data_size, struct idmap *idmap) +static struct key *nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, struct idmap *idmap) { - const struct cred *saved_cred; - struct key *rkey; char *desc; - struct user_key_payload *payload; + struct key *rkey; ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); if (ret <= 0) - goto out; + return ERR_PTR(ret); + + rkey = request_key(&key_type_id_resolver, desc, ""); + if (IS_ERR(rkey)) { + mutex_lock(&idmap->idmap_mutex); + rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, + desc, "", 0, idmap); + mutex_unlock(&idmap->idmap_mutex); + } + + kfree(desc); + return rkey; +} + +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) +{ + const struct cred *saved_cred; + struct key *rkey; + struct user_key_payload *payload; + ssize_t ret; saved_cred = override_creds(id_resolver_cache); - if (idmap) - rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap); - else - rkey = request_key(&key_type_id_resolver, desc, ""); + rkey = nfs_idmap_request_key(name, namelen, type, idmap); revert_creds(saved_cred); - kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; @@ -316,23 +329,6 @@ out: return ret; } -static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, - const char *type, void *data, - size_t data_size, struct idmap *idmap) -{ - ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver, - name, namelen, type, data, - data_size, NULL); - if (ret < 0) { - mutex_lock(&idmap->idmap_mutex); - ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, - name, namelen, type, data, - data_size, idmap); - mutex_unlock(&idmap->idmap_mutex); - } - return ret; -} - /* ID -> Name */ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen, struct idmap *idmap) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1f941674b089..c93639e6cf68 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -48,7 +48,6 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" -#include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" @@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) return -ERESTARTSYS; - freezable_schedule(); + freezable_schedule_unsafe(); return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); @@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode) memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; nfs_fscache_invalidate(inode); - } else { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; - } + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_LABEL + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; + } else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_LABEL + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; } void nfs_zap_caches(struct inode *inode) @@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque) return 0; } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + int error; + + if (label == NULL) + return; + + if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0) + return; + + if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2) + return; + + if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { + error = security_inode_notifysecctx(inode, label->label, + label->len); + if (error) + printk(KERN_ERR "%s() %s %d " + "security_inode_notifysecctx() %d\n", + __func__, + (char *)label->label, + label->len, error); + } +} + +struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) +{ + struct nfs4_label *label = NULL; + int minor_version = server->nfs_client->cl_minorversion; + + if (minor_version < 2) + return label; + + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + return label; + + label = kzalloc(sizeof(struct nfs4_label), flags); + if (label == NULL) + return ERR_PTR(-ENOMEM); + + label->label = kzalloc(NFS4_MAXLABELLEN, flags); + if (label->label == NULL) { + kfree(label); + return ERR_PTR(-ENOMEM); + } + label->len = NFS4_MAXLABELLEN; + + return label; +} +EXPORT_SYMBOL_GPL(nfs4_label_alloc); +#else +void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ +} +#endif +EXPORT_SYMBOL_GPL(nfs_setsecurity); + /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. */ struct inode * -nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs_find_desc desc = { .fh = fh, @@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } + + nfs_setsecurity(inode, fattr, label); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; @@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); + nfs_setsecurity(inode, fattr, label); dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) - nfs_refresh_inode(inode, fattr); + error = nfs_refresh_inode(inode, fattr); nfs_free_fattr(fattr); out: return error; @@ -561,20 +632,22 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); + nfs_iocounter_init(&l_ctx->io_count); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *pos; + struct nfs_lock_context *head = &ctx->lock_context; + struct nfs_lock_context *pos = head; - list_for_each_entry(pos, &ctx->lock_context.list, list) { + do { if (pos->lockowner.l_owner != current->files) continue; if (pos->lockowner.l_pid != current->tgid) continue; atomic_inc(&pos->count); return pos; - } + } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; } @@ -711,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context); * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ -void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = ctx->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - filp->private_data = get_nfs_open_context(ctx); spin_lock(&inode->i_lock); list_add(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); + +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + filp->private_data = get_nfs_open_context(ctx); + if (list_empty(&ctx->list)) + nfs_inode_attach_open_context(ctx); +} EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* @@ -746,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c static void nfs_file_clear_open_context(struct file *filp) { - struct inode *inode = file_inode(filp); struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { + struct inode *inode = ctx->dentry->d_inode; + filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -788,6 +869,7 @@ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; + struct nfs4_label *label = NULL; struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); @@ -805,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) { + status = PTR_ERR(label); + goto out; + } + + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", inode->i_sb->s_id, @@ -815,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (!S_ISDIR(inode->i_mode)) set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } - goto out; + goto err_out; } status = nfs_refresh_inode(inode, fattr); @@ -823,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); - goto out; + goto err_out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) @@ -833,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - out: +err_out: + nfs4_label_free(label); +out: nfs_free_fattr(fattr); return status; } @@ -861,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) + if (!(NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) && !nfs_attribute_cache_expired(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); @@ -1241,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); @@ -1481,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ - if (invalid & NFS_INO_INVALID_ATTR) { + if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1494,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } invalid &= ~NFS_INO_INVALID_ATTR; + invalid &= ~NFS_INO_INVALID_LABEL; /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -1636,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { nfs_clients_init(net); - return nfs_dns_resolver_cache_init(net); + return 0; } static void nfs_net_exit(struct net *net) { - nfs_dns_resolver_cache_destroy(net); nfs_cleanup_cb_ident_idr(net); } @@ -1659,10 +1752,6 @@ static int __init init_nfs_fs(void) { int err; - err = nfs_dns_resolver_init(); - if (err < 0) - goto out10;; - err = register_pernet_subsys(&nfs_net_ops); if (err < 0) goto out9; @@ -1728,8 +1817,6 @@ out7: out8: unregister_pernet_subsys(&nfs_net_ops); out9: - nfs_dns_resolver_destroy(); -out10: return err; } @@ -1742,7 +1829,6 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); - nfs_dns_resolver_destroy(); #ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 541c9ebdbc5a..3c8373f90ab3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, - struct nfs4_sessionid *); + struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, struct nfs_subversion *); extern struct nfs_server *nfs4_create_server( @@ -229,6 +229,13 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); +int nfs_iocounter_wait(struct nfs_io_counter *c); + +static inline void nfs_iocounter_init(struct nfs_io_counter *c) +{ + c->flags = 0; + atomic_set(&c->io_count, 0); +} /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; @@ -248,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *, #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; extern const u32 nfs41_maxwrite_overhead; +extern const u32 nfs41_maxgetdevinfo_overhead; #endif /* nfs4proc.c */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 91a6faf811ac..99a45283b9ee 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -139,7 +139,10 @@ struct mnt_fhstatus { * nfs_mount - Obtain an NFS file handle for the given host and path * @info: pointer to mount request arguments * - * Uses default timeout parameters specified by underlying transport. + * Uses default timeout parameters specified by underlying transport. On + * successful return, the auth_flavs list and auth_flav_len will be populated + * with the list from the server or a faked-up list if the server didn't + * provide one. */ int nfs_mount(struct nfs_mount_request *info) { @@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info) dprintk("NFS: MNT request succeeded\n"); status = 0; + /* + * If the server didn't provide a flavor list, allow the + * client to try any flavor. + */ + if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) { + dprintk("NFS: Faking up auth_flavs list\n"); + info->auth_flavs[0] = RPC_AUTH_NULL; + *info->auth_flav_len = 1; + } out: return status; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index fc8dc20fdeb9..348b535cd786 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, struct dentry *parent = dget_parent(dentry); /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); dput(parent); if (err != 0) return ERR_PTR(err); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 43ea96ced28c..f5c84c3efbca 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; @@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], @@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, static int nfs3_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { struct nfs3_diropargs arg = { .fh = NFS_FH(dir), @@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_ status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); return status; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 944c9a5c1039..ee81e354bce7 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -36,6 +36,7 @@ enum nfs4_client_state { struct nfs4_minor_version_ops { u32 minor_version; + unsigned init_caps; int (*call_sync)(struct rpc_clnt *clnt, struct nfs_server *server, @@ -46,6 +47,8 @@ struct nfs4_minor_version_ops { const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + int (*free_lock_state)(struct nfs_server *, + struct nfs4_lock_state *); const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -143,12 +146,14 @@ struct nfs4_lock_state { enum { LK_STATE_IN_USE, NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_OPEN_STATE, /* OPEN stateid is set */ NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ + NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ }; struct nfs4_state { @@ -189,7 +194,7 @@ struct nfs4_state_recovery_ops { int (*recover_lock)(struct nfs4_state *, struct file_lock *); int (*establish_clid)(struct nfs_client *, struct rpc_cred *); struct rpc_cred * (*get_clid_cred)(struct nfs_client *); - int (*reclaim_complete)(struct nfs_client *); + int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); int (*detect_trunking)(struct nfs_client *, struct nfs_client **, struct rpc_cred *); }; @@ -231,8 +236,11 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern int nfs4_release_lockowner(struct nfs4_lock_state *); extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode); #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -295,10 +303,10 @@ is_ds_client(struct nfs_client *clp) extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[3]; -extern const u32 nfs4_statfs_bitmap[2]; -extern const u32 nfs4_pathconf_bitmap[2]; +extern const u32 nfs4_statfs_bitmap[3]; +extern const u32 nfs4_pathconf_bitmap[3]; extern const u32 nfs4_fsinfo_bitmap[3]; -extern const u32 nfs4_fs_locations_bitmap[2]; +extern const u32 nfs4_fs_locations_bitmap[3]; void nfs4_free_client(struct nfs_client *); @@ -347,13 +355,13 @@ extern int nfs4_wait_clnt_recover(struct nfs_client *clp); extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); -extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, +extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, fmode_t, const struct nfs_lockowner *); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); @@ -412,6 +420,11 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei return memcmp(dst, src, sizeof(*dst)) == 0; } +static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) +{ + return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; +} + #else #define nfs4_close_state(a, b) do { } while (0) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 66b6664dcd4c..90dce91dd5b5 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) if (err) goto error; + if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) { + err = -EINVAL; + goto error; + } + spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -198,8 +203,12 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; + if (clp->cl_minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, authflavour); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + if (error == -EINVAL) + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -558,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr, */ struct nfs_client * nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, - struct nfs4_sessionid *sid) + struct nfs4_sessionid *sid, u32 minorversion) { struct nfs_client *clp; struct nfs_net *nn = net_generic(net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, 1) == false) + if (nfs4_cb_match_client(addr, clp, minorversion) == false) continue; if (!nfs4_has_session(clp)) @@ -588,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, struct nfs_client * nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, - struct nfs4_sessionid *sid) + struct nfs4_sessionid *sid, u32 minorversion) { return NULL; } @@ -622,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server, if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (server->options & NFS_OPTION_MIGRATION) + set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); @@ -726,10 +737,23 @@ static int nfs4_server_common_setup(struct nfs_server *server, return -ENOMEM; /* We must ensure the session is initialised first */ - error = nfs4_init_session(server); + error = nfs4_init_session(server->nfs_client); if (error < 0) goto out; + /* Set the basic capabilities */ + server->caps |= server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + /* Probe the root fh to retrieve its FSID and filehandle */ error = nfs4_get_rootfh(server, mntfh); if (error < 0) @@ -773,9 +797,6 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) - server->caps |= NFS_CAP_READDIRPLUS; server->options = data->options; /* Get a client record */ @@ -792,13 +813,6 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; - if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -876,7 +890,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. * Note: NFSv4 always uses TCP, */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 13e6bb3e3fe5..e5b804dd944c 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) goto out_drop; } } - iput(inode); if (inode != dentry->d_inode) goto out_drop; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 4fb234d3aefb..17ed87ef9de8 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -158,11 +158,14 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_schedule_stateid_recovery(mds_server, state); + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(mds_server, state); + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } nfs4_schedule_lease_recovery(mds_client); goto wait_on_recovery; /* DS session errors */ @@ -226,6 +229,9 @@ reset: out: task->tk_status = 0; return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; wait_on_recovery: rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) @@ -299,6 +305,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) { struct nfs_read_data *rdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(rdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_read(rdata); @@ -307,10 +317,13 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } rdata->read_done_cb = filelayout_read_done_cb; - nfs41_setup_sequence(rdata->ds_clp->cl_session, + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ); } static void filelayout_read_call_done(struct rpc_task *task, void *data) @@ -401,16 +414,23 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) { struct nfs_write_data *wdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(wdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_write(wdata); rpc_exit(task, 0); return; } - nfs41_setup_sequence(wdata->ds_clp->cl_session, + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE); } static void filelayout_write_call_done(struct rpc_task *task, void *data) @@ -623,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, NFS_SERVER(lo->plh_inode)->nfs_client, id); if (d == NULL) { - dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); + dsaddr = filelayout_get_device_info(lo->plh_inode, id, + lo->plh_lc_cred, gfp_flags); if (dsaddr == NULL) goto out; } else diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index b8da95548d3d..cebd20e7e923 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -70,6 +70,8 @@ struct nfs4_pnfs_ds { struct list_head ds_addrs; struct nfs_client *ds_clp; atomic_t ds_count; + unsigned long ds_state; +#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; struct nfs4_file_layout_dsaddr { @@ -148,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 1fe284f01f8b..95604f64cab8 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl * of available devices, and return it. */ struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) +filelayout_get_device_info(struct inode *inode, + struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, + gfp_t gfp_flags) { struct pnfs_device *pdev = NULL; u32 max_resp_sz; @@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf pdev->pgbase = 0; pdev->pglen = max_resp_sz; pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - rc = nfs4_proc_getdeviceinfo(server, pdev); + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); dprintk("%s getdevice info returns %d\n", __func__, rc); if (rc) goto out_free; @@ -775,6 +779,22 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) return flseg->fh_array[i]; } +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ + might_sleep(); + wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ + smp_mb__before_clear_bit(); + clear_bit(NFS4DS_CONNECTING, &ds->ds_state); + smp_mb__after_clear_bit(); + wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + + struct nfs4_pnfs_ds * nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { @@ -791,16 +811,22 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) filelayout_mark_devid_invalid(devid); return NULL; } + if (ds->ds_clp) + return ds; - if (!ds->ds_clp) { + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); int err; err = nfs4_ds_connect(s, ds); if (err) { nfs4_mark_deviceid_unavailable(devid); - return NULL; + ds = NULL; } + nfs4_clear_ds_conn_bit(ds); + } else { + /* Either ds is connected, or ds is NULL */ + nfs4_wait_ds_connect(ds); } return ds; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 0dd766079e1c..cdb0b41a4810 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -134,33 +134,38 @@ static size_t nfs_parse_server_name(char *string, size_t len, return ret; } +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return the pseudoflavor of the first security mechanism in + * "flavors" that is locally supported. Return RPC_AUTH_UNIX if + * no matching flavor is found in the array. The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation. + */ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { - struct gss_api_mech *mech; - struct xdr_netobj oid; - int i; - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + rpc_authflavor_t pseudoflavor; + struct nfs4_secinfo4 *secinfo; + unsigned int i; for (i = 0; i < flavors->num_flavors; i++) { - struct nfs4_secinfo_flavor *flavor; - flavor = &flavors->flavors[i]; - - if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { - pseudoflavor = flavor->flavor; - break; - } else if (flavor->flavor == RPC_AUTH_GSS) { - oid.len = flavor->gss.sec_oid4.len; - oid.data = flavor->gss.sec_oid4.data; - mech = gss_mech_get_by_OID(&oid); - if (!mech) - continue; - pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); - gss_mech_put(mech); + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + if (pseudoflavor != RPC_AUTH_MAXFLAVOR) + return pseudoflavor; break; } } - return pseudoflavor; + return RPC_AUTH_UNIX; } static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0ad025eb523b..cf11799297c4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); -static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state); + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 -static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); -static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); #endif + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *label) +{ + int err; + + if (label == NULL) + return NULL; + + if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) + return NULL; + + if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2) + return NULL; + + err = security_dentry_init_security(dentry, sattr->ia_mode, + &dentry->d_name, (void **)&label->label, &label->len); + if (err == 0) + return label; + + return NULL; +} +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ + if (label) + security_release_secctx(label->label, label->len); +} +static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ + if (label) + return server->attr_bitmask; + + return server->attr_bitmask_nl; +} +#else +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *l) +{ return NULL; } +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ return; } +static inline u32 * +nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ return server->attr_bitmask; } +#endif + /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { @@ -107,6 +160,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_ACCESS: return -EACCES; + case -NFS4ERR_FILE_OPEN: + return -EBUSY; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -132,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_TIME_MODIFY, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + FATTR4_WORD2_SECURITY_LABEL +#endif }; static const u32 nfs4_pnfs_open_bitmap[3] = { @@ -159,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = { | FATTR4_WORD0_FILEID, }; -const u32 nfs4_statfs_bitmap[2] = { +const u32 nfs4_statfs_bitmap[3] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL, @@ -168,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = { | FATTR4_WORD1_SPACE_TOTAL }; -const u32 nfs4_pathconf_bitmap[2] = { +const u32 nfs4_pathconf_bitmap[3] = { FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME, 0 @@ -183,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE FATTR4_WORD2_LAYOUT_BLKSIZE }; -const u32 nfs4_fs_locations_bitmap[2] = { +const u32 nfs4_fs_locations_bitmap[3] = { FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE @@ -199,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = { | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY - | FATTR4_WORD1_MOUNTED_ON_FILEID + | FATTR4_WORD1_MOUNTED_ON_FILEID, }; static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -266,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; - freezable_schedule_timeout_killable(*timeout); + freezable_schedule_timeout_killable_unsafe(*timeout); if (fatal_signal_pending(current)) res = -ERESTARTSYS; *timeout <<= 1; @@ -295,19 +353,30 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc } if (state == NULL) break; - nfs4_schedule_stateid_recovery(server, state); + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; goto wait_on_recovery; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) { + nfs_remove_bad_delegation(inode); + exception->retry = 1; + break; + } if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); - nfs4_schedule_stateid_recovery(server, state); + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(server, state); + if (state != NULL) { + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -559,7 +628,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, task->tk_timeout = 0; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ dprintk("%s session is draining\n", __func__); @@ -749,6 +818,7 @@ struct nfs4_opendata { struct nfs4_string owner_name; struct nfs4_string group_name; struct nfs_fattr f_attr; + struct nfs4_label *f_label; struct dentry *dir; struct dentry *dentry; struct nfs4_state_owner *owner; @@ -756,14 +826,45 @@ struct nfs4_opendata { struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; + unsigned int is_recover : 1; int rpc_status; int cancelled; }; +static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + if (err != -EINVAL) + return false; + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + return false; + server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; + exception->retry = 1; + return true; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, + enum open_claim_type4 claim) +{ + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + return claim; + switch (claim) { + default: + return claim; + case NFS4_OPEN_CLAIM_FH: + return NFS4_OPEN_CLAIM_NULL; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + return NFS4_OPEN_CLAIM_DELEGATE_CUR; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + return NFS4_OPEN_CLAIM_DELEGATE_PREV; + } +} static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; + p->o_res.f_label = p->f_label; p->o_res.seqid = p->o_arg.seqid; p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; @@ -775,6 +876,8 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, const struct iattr *attrs, + struct nfs4_label *label, + enum open_claim_type4 claim, gfp_t gfp_mask) { struct dentry *parent = dget_parent(dentry); @@ -785,15 +888,19 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) goto err; + + p->f_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->f_label)) + goto err_free_p; + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); if (p->o_arg.seqid == NULL) - goto err_free; + goto err_free_label; nfs_sb_active(dentry->d_sb); p->dentry = dget(dentry); p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); - p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS @@ -809,9 +916,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.id.uniquifier = sp->so_seqid.owner_id; p->o_arg.name = &dentry->d_name; p->o_arg.server = server; - p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + p->o_arg.label = label; + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); + switch (p->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + p->o_arg.fh = NFS_FH(dir); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + p->o_arg.fh = NFS_FH(dentry->d_inode); + } if (attrs != NULL && attrs->ia_valid != 0) { __be32 verf[2]; @@ -829,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, nfs4_init_opendata_res(p); kref_init(&p->kref); return p; -err_free: + +err_free_label: + nfs4_label_free(p->f_label); +err_free_p: kfree(p); err: dput(parent); @@ -846,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref) if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + + nfs4_label_free(p->f_label); + dput(p->dir); dput(p->dentry); nfs_sb_deactive(sb); @@ -924,6 +1050,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + set_bit(NFS_OPEN_STATE, &state->flags); switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1022,7 +1149,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); + int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; @@ -1047,9 +1174,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) nfs4_stateid_copy(&stateid, &delegation->stateid); rcu_read_unlock(); nfs_release_seqid(opendata->o_arg.seqid); - ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - if (ret != 0) - goto out; + if (!opendata->is_recover) { + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + } ret = -EAGAIN; /* Try to update the stateid using the delegation */ @@ -1121,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (ret) goto err; + nfs_setsecurity(inode, &data->f_attr, data->f_label); + if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, @@ -1147,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) ret = -EAGAIN; if (!(data->f_attr.valid & NFS_ATTR_FATTR)) goto err; - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); ret = PTR_ERR(inode); if (IS_ERR(inode)) goto err; @@ -1194,11 +1325,13 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state * return ERR_PTR(-ENOENT); } -static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, + struct nfs4_state *state, enum open_claim_type4 claim) { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS); + opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, + NULL, NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -1234,6 +1367,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -1284,11 +1418,10 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state fmode_t delegation_type = 0; int status; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_PREVIOUS); if (IS_ERR(opendata)) return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; - opendata->o_arg.fh = NFS_FH(state->inode); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) @@ -1307,6 +1440,8 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -1321,71 +1456,72 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_reclaim(ctx, state); put_nfs_open_context(ctx); return ret; } -static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) { - struct nfs4_opendata *opendata; - int ret; - - opendata = nfs4_open_recoverdata_alloc(ctx, state); - if (IS_ERR(opendata)) - return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - ret = nfs4_open_recover(opendata, state); - nfs4_opendata_put(opendata); - return ret; + switch (err) { + default: + printk(KERN_ERR "NFS: %s: unhandled error " + "%d.\n", __func__, err); + case 0: + case -ENOENT: + case -ESTALE: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + return -EAGAIN; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + return -EAGAIN; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + return 0; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + set_bit(NFS_DELEGATED_STATE, &state->flags); + ssleep(1); + return -EAGAIN; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + return 0; + } + return err; } int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { - struct nfs4_exception exception = { }; struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_opendata *opendata; int err; - do { - err = _nfs4_open_delegation_recall(ctx, state, stateid); - switch (err) { - case 0: - case -ENOENT: - case -ESTALE: - goto out; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); - err = -EAGAIN; - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: - /* Don't recall a delegation if it was lost */ - nfs4_schedule_lease_recovery(server->nfs_client); - err = -EAGAIN; - goto out; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - nfs_inode_find_state_and_recover(state->inode, - stateid); - nfs4_schedule_stateid_recovery(server, state); - case -ENOMEM: - err = 0; - goto out; - } - set_bit(NFS_DELEGATED_STATE, &state->flags); - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_DELEG_CUR_FH); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); + err = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) @@ -1468,6 +1604,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; + struct nfs_client *clp = sp->so_server->nfs_client; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) goto out_wait; @@ -1483,15 +1620,20 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && can_open_delegated(delegation, data->o_arg.fmode)) goto unlock_no_action; rcu_read_unlock(); } /* Update client id. */ - data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; - if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->o_arg.clientid = clp->cl_clientid; + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; + case NFS4_OPEN_CLAIM_FH: + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; @@ -1500,6 +1642,16 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) &data->o_res.seq_res, task) != 0) nfs_release_seqid(data->o_arg.seqid); + + /* Set the create mode (note dependency on the session type) */ + data->o_arg.createmode = NFS4_CREATE_UNCHECKED; + if (data->o_arg.open_flags & O_EXCL) { + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; + if (nfs4_has_persistent_session(clp)) + data->o_arg.createmode = NFS4_CREATE_GUARDED; + else if (clp->cl_mvops->minor_version > 0) + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; + } return; unlock_no_action: rcu_read_unlock(); @@ -1595,8 +1747,11 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->rpc_done = 0; data->rpc_status = 0; data->cancelled = 0; - if (isrecover) + data->is_recover = 0; + if (isrecover) { nfs4_set_sequence_privileged(&o_arg->seq_args); + data->is_recover = 1; + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1702,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); + _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); return 0; } @@ -1721,7 +1876,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); @@ -1739,6 +1895,8 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state do { err = _nfs4_open_expired(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; switch (err) { default: goto out; @@ -1759,7 +1917,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_expired(ctx, state); put_nfs_open_context(ctx); return ret; @@ -1770,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid *stateid = &state->stateid; - int status; + struct nfs_delegation *delegation; + struct rpc_cred *cred = NULL; + int status = -NFS4ERR_BAD_STATEID; /* If a state reset has been done, test_stateid is unneeded */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) return; - status = nfs41_test_stateid(server, stateid); + /* Get the delegation credential for use by test/free_stateid */ + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match(&delegation->stateid, stateid)) { + cred = get_rpccred(delegation->cred); + rcu_read_unlock(); + status = nfs41_test_stateid(server, stateid, cred); + } else + rcu_read_unlock(); + if (status != NFS_OK) { /* Free the stateid unless the server explicitly * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid); + nfs41_free_stateid(server, stateid, cred); nfs_remove_bad_delegation(state->inode); write_seqlock(&state->seqlock); @@ -1789,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) write_sequnlock(&state->seqlock); clear_bit(NFS_DELEGATED_STATE, &state->flags); } + + if (cred != NULL) + put_rpccred(cred); } /** @@ -1803,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid *stateid = &state->open_stateid; + struct rpc_cred *cred = state->owner->so_cred; int status; /* If a state reset has been done, test_stateid is unneeded */ @@ -1811,16 +1985,17 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) return -NFS4ERR_BAD_STATEID; - status = nfs41_test_stateid(server, stateid); + status = nfs41_test_stateid(server, stateid, cred); if (status != NFS_OK) { /* Free the stateid unless the server explicitly * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid); + nfs41_free_stateid(server, stateid, cred); clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); } return status; } @@ -1856,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, fmode_t fmode, int flags, - struct nfs4_state **res) + struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; + struct dentry *dentry; struct nfs4_state *state; unsigned int seq; int ret; @@ -1877,15 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + dentry = opendata->dentry; + if (dentry->d_inode == NULL) { + /* FIXME: Is this d_drop() ever needed? */ + d_drop(dentry); + dentry = d_add_unique(dentry, igrab(state->inode)); + if (dentry == NULL) { + dentry = opendata->dentry; + } else if (dentry != ctx->dentry) { + dput(ctx->dentry); + ctx->dentry = dget(dentry); + } + nfs_set_verifier(dentry, + nfs_save_change_attribute(opendata->dir->d_inode)); + } + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); if (ret != 0) goto out; - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { - nfs4_schedule_stateid_recovery(server, state); - nfs4_wait_clnt_recover(server->nfs_client); + ctx->state = state; + if (dentry->d_inode == state->inode) { + nfs_inode_attach_open_context(ctx); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + nfs4_schedule_stateid_recovery(server, state); } - *res = state; out: return ret; } @@ -1894,18 +2086,21 @@ out: * Returns a referenced nfs4_state */ static int _nfs4_do_open(struct inode *dir, - struct dentry *dentry, - fmode_t fmode, + struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct rpc_cred *cred, - struct nfs4_state **res, - struct nfs4_threshold **ctx_th) + struct nfs4_label *label) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *opendata; + struct dentry *dentry = ctx->dentry; + struct rpc_cred *cred = ctx->cred; + struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; + fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); + enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct nfs4_label *olabel = NULL; int status; /* Protect against reboot recovery conflicts */ @@ -1921,33 +2116,48 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) nfs4_return_incompatible_delegation(dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL); + if (dentry->d_inode) + claim = NFS4_OPEN_CLAIM_FH; + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, + label, claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; + if (label) { + olabel = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(olabel)) { + status = PTR_ERR(olabel); + goto err_opendata_put; + } + } + if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); if (!opendata->f_attr.mdsthreshold) - goto err_opendata_put; + goto err_free_label; opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); - status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); + status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); if (status != 0) - goto err_opendata_put; + goto err_free_label; + state = ctx->state; - if (opendata->o_arg.open_flags & O_EXCL) { + if ((opendata->o_arg.open_flags & O_EXCL) && + (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr); nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, - state); - if (status == 0) + state, label, olabel); + if (status == 0) { nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } } if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) @@ -1956,37 +2166,37 @@ static int _nfs4_do_open(struct inode *dir, kfree(opendata->f_attr.mdsthreshold); opendata->f_attr.mdsthreshold = NULL; + nfs4_label_free(olabel); + nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); - *res = state; return 0; +err_free_label: + nfs4_label_free(olabel); err_opendata_put: kfree(opendata->f_attr.mdsthreshold); nfs4_opendata_put(opendata); err_put_state_owner: nfs4_put_state_owner(sp); out_err: - *res = NULL; return status; } static struct nfs4_state *nfs4_do_open(struct inode *dir, - struct dentry *dentry, - fmode_t fmode, + struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct rpc_cred *cred, - struct nfs4_threshold **ctx_th) + struct nfs4_label *label) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; int status; - fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC; do { - status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, - &res, ctx_th); + status = _nfs4_do_open(dir, ctx, flags, sattr, label); + res = ctx->state; if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -2022,7 +2232,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, exception.retry = 1; continue; } - res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) + continue; + res = ERR_PTR(nfs4_handle_exception(server, status, &exception)); } while (exception.retry); return res; @@ -2030,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state) + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_setattrargs arg = { @@ -2038,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .iap = sattr, .server = server, .bitmask = server->attr_bitmask, + .label = ilabel, }; struct nfs_setattrres res = { .fattr = fattr, + .label = olabel, .server = server, }; struct rpc_message msg = { @@ -2050,20 +2265,29 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .rpc_cred = cred, }; unsigned long timestamp = jiffies; + fmode_t fmode; + bool truncate; int status; + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + nfs_fattr_init(fattr); - if (state != NULL) { + /* Servers should only apply open mode checks for file size changes */ + truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + fmode = truncate ? FMODE_WRITE : FMODE_READ; + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + /* Use that stateid */ + } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { struct nfs_lockowner lockowner = { .l_owner = current->files, .l_pid = current->tgid, }; nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, &lockowner); - } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, - FMODE_WRITE)) { - /* Use that stateid */ } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); @@ -2075,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state) + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { @@ -2084,9 +2309,16 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; int err; do { - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); + err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); switch (err) { case -NFS4ERR_OPENMODE: + if (!(sattr->ia_valid & ATTR_SIZE)) { + pr_warn_once("NFSv4: server %s is incorrectly " + "applying open mode checks to " + "a SETATTR that is not " + "changing file size.\n", + server->nfs_client->cl_hostname); + } if (state && !(state->state & FMODE_WRITE)) { err = -EBADF; if (sattr->ia_valid & ATTR_OPEN) @@ -2130,11 +2362,19 @@ static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, fmode_t fmode) { spin_lock(&state->owner->so_lock); - if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: clear_bit(NFS_O_RDONLY_STATE, &state->flags); - if (!(fmode & FMODE_WRITE)) + break; + case FMODE_READ: clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } spin_unlock(&state->owner->so_lock); } @@ -2202,6 +2442,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) calldata->arg.fmode &= ~FMODE_WRITE; } } + if (!nfs4_valid_open_stateid(state)) + call_close = 0; spin_unlock(&state->owner->so_lock); if (!call_close) { @@ -2212,8 +2454,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && - pnfs_roc_drain(inode, &calldata->roc_barrier, task)) + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { + nfs_release_seqid(calldata->arg.seqid); goto out_wait; + } } nfs_fattr_init(calldata->res.fattr); @@ -2310,14 +2554,18 @@ static struct inode * nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { struct nfs4_state *state; + struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + + label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); /* Protect against concurrent sillydeletes */ - state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, - ctx->cred, &ctx->mdsthreshold); + state = nfs4_do_open(dir, ctx, open_flags, attr, label); + + nfs4_label_release_security(label); + if (IS_ERR(state)) return ERR_CAST(state); - ctx->state = state; - return igrab(state->inode); + return state->inode; } static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) @@ -2373,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_CTIME; if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) server->caps |= NFS_CAP_MTIME; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) + server->caps |= NFS_CAP_SECURITY_LABEL; +#endif + memcpy(server->attr_bitmask_nl, res.attr_bitmask, + sizeof(server->attr_bitmask)); + if (server->caps & NFS_CAP_SECURITY_LABEL) { + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + } memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; @@ -2399,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + u32 bitmask[3]; struct nfs4_lookup_root_arg args = { - .bitmask = nfs4_fattr_bitmap, + .bitmask = bitmask, }; struct nfs4_lookup_res res = { .server = server, @@ -2413,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; + bitmask[0] = nfs4_fattr_bitmap[0]; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* + * Process the label in the upcoming getfattr + */ + bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; + nfs_fattr_init(info->fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } @@ -2444,7 +2710,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl auth = rpcauth_create(flavor, server->client); if (IS_ERR(auth)) { - ret = -EIO; + ret = -EACCES; goto out; } ret = nfs4_lookup_root(server, fhandle, info); @@ -2452,27 +2718,36 @@ out: return ret; } +/* + * Retry pseudoroot lookup with various security flavors. We do this when: + * + * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC + * NFSv4.1: the server does not support the SECINFO_NO_NAME operation + * + * Returns zero on success, or a negative NFS4ERR value, or a + * negative errno value. + */ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int i, len, status = 0; - rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; - - len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); - if (len < 0) - return len; - - for (i = 0; i < len; i++) { - /* AUTH_UNIX is the default flavor if none was specified, - * thus has already been tried. */ - if (flav_array[i] == RPC_AUTH_UNIX) - continue; + /* Per 3530bis 15.33.5 */ + static const rpc_authflavor_t flav_array[] = { + RPC_AUTH_GSS_KRB5P, + RPC_AUTH_GSS_KRB5I, + RPC_AUTH_GSS_KRB5, + RPC_AUTH_UNIX, /* courtesy */ + RPC_AUTH_NULL, + }; + int status = -EPERM; + size_t i; + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; break; } + /* * -EACCESS could mean that the user doesn't have correct permissions * to access the mount. It could also mean that we tried to mount @@ -2485,24 +2760,36 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -/* - * get the file handle for the "/" directory on the server +static int nfs4_do_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fsinfo *info) +{ + int mv = server->nfs_client->cl_minorversion; + return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); +} + +/** + * nfs4_proc_get_rootfh - get file handle for server's pseudoroot + * @server: initialized nfs_server handle + * @fhandle: we fill in the pseudo-fs root file handle + * @info: we fill in an FSINFO struct + * + * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int minor_version = server->nfs_client->cl_minorversion; - int status = nfs4_lookup_root(server, fhandle, info); - if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) - /* - * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM - * by nfs4_map_errors() as this function exits. - */ - status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info); + int status; + + status = nfs4_lookup_root(server, fhandle, info); + if ((status == -NFS4ERR_WRONGSEC) && + !(server->flags & NFS_MOUNT_SECFLAVOUR)) + status = nfs4_do_find_root_sec(server, fhandle, info); + if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); + return nfs4_map_errors(status); } @@ -2511,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, { int error; struct nfs_fattr *fattr = info->fattr; + struct nfs4_label *label = NULL; error = nfs4_server_capabilities(server, mntfh); if (error < 0) { @@ -2518,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, return error; } - error = nfs4_proc_getattr(server, mntfh, fattr); + label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + error = nfs4_proc_getattr(server, mntfh, fattr, label); if (error < 0) { dprintk("nfs4_get_root: getattr error = %d\n", -error); - return error; + goto err_free_label; } if (fattr->valid & NFS_ATTR_FATTR_FSID && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); +err_free_label: + nfs4_label_free(label); + return error; } @@ -2574,7 +2869,8 @@ out: return status; } -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_getattr_arg args = { .fh = fhandle, @@ -2582,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; struct nfs4_getattr_res res = { .fattr = fattr, + .label = label, .server = server, }; struct rpc_message msg = { @@ -2589,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - + + args.bitmask = nfs4_bitmask(server, label); + nfs_fattr_init(fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_proc_getattr(server, fhandle, fattr), + _nfs4_proc_getattr(server, fhandle, fattr, label), &exception); } while (exception.retry); return err; @@ -2630,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct inode *inode = dentry->d_inode; struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; + struct nfs4_label *label = NULL; int status; if (pnfs_ld_layoutret_on_setattr(inode)) @@ -2656,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } } - status = nfs4_do_setattr(inode, cred, fattr, sattr, state); - if (status == 0) + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + if (status == 0) { nfs_setattr_update_inode(inode, sattr); + nfs_setsecurity(inode, fattr, label); + } + nfs4_label_free(label); return status; } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs_server *server = NFS_SERVER(dir); int status; @@ -2676,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct nfs4_lookup_res res = { .server = server, .fattr = fattr, + .label = label, .fh = fhandle, }; struct rpc_message msg = { @@ -2684,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, .rpc_resp = &res, }; + args.bitmask = nfs4_bitmask(server, label); + nfs_fattr_init(fattr); dprintk("NFS call lookup %s\n", name->name); @@ -2702,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; struct rpc_clnt *client = *clnt; int err; do { - err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr); + err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); switch (err) { case -NFS4ERR_BADNAME: err = -ENOENT; @@ -2742,12 +3053,13 @@ out: } static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -2762,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, int status; struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); - status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); if (status < 0) { rpc_shutdown_client(client); return ERR_PTR(status); @@ -2787,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_cred = entry->cred, }; int mode = entry->mask; - int status; + int status = 0; /* * Determine which access bits we want to ask for... @@ -2892,6 +3204,7 @@ static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; @@ -2900,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (IS_ERR(ctx)) return PTR_ERR(ctx); + ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); + sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, dentry, ctx->mode, - flags, sattr, ctx->cred, - &ctx->mdsthreshold); - d_drop(dentry); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; } - d_add(dentry, igrab(state->inode)); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - ctx->state = state; out: + nfs4_label_release_security(ilabel); put_nfs_open_context(ctx); return status; } @@ -2961,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); + + nfs_fattr_init(res->dir_attr); } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -3036,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, .rpc_resp = &res, }; int status = -ENOMEM; - + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); @@ -3070,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * }; struct nfs4_link_res res = { .server = server, + .label = NULL, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], @@ -3082,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * if (res.fattr == NULL) goto out; + res.label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(res.label)) { + status = PTR_ERR(res.label); + goto out; + } + arg.bitmask = nfs4_bitmask(server, res.label); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(inode, res.fattr); + status = nfs_post_op_update_inode(inode, res.fattr); + if (!status) + nfs_setsecurity(inode, res.fattr, res.label); } + + + nfs4_label_free(res.label); + out: nfs_free_fattr(res.fattr); return status; @@ -3110,6 +3436,7 @@ struct nfs4_createdata { struct nfs4_create_res res; struct nfs_fh fh; struct nfs_fattr fattr; + struct nfs4_label *label; }; static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, @@ -3121,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, if (data != NULL) { struct nfs_server *server = NFS_SERVER(dir); + data->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(data->label)) + goto out_free; + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; data->msg.rpc_argp = &data->arg; data->msg.rpc_resp = &data->res; @@ -3129,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->arg.name = name; data->arg.attrs = sattr; data->arg.ftype = ftype; - data->arg.bitmask = server->attr_bitmask; + data->arg.bitmask = nfs4_bitmask(server, data->label); data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; + data->res.label = data->label; nfs_fattr_init(data->res.fattr); } return data; +out_free: + kfree(data); + return NULL; } static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) @@ -3144,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } return status; } static void nfs4_free_createdata(struct nfs4_createdata *data) { + nfs4_label_free(data->label); kfree(data); } static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) + struct page *page, unsigned int len, struct iattr *sattr, + struct nfs4_label *label) { struct nfs4_createdata *data; int status = -ENAMETOOLONG; @@ -3171,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; data->arg.u.symlink.pages = &page; data->arg.u.symlink.len = len; + data->arg.label = label; status = nfs4_do_create(dir, dentry, data); @@ -3183,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + do { err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_symlink(dir, dentry, page, - len, sattr), + len, sattr, label), &exception); } while (exception.retry); + + nfs4_label_release_security(label); return err; } static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) + struct iattr *sattr, struct nfs4_label *label) { struct nfs4_createdata *data; int status = -ENOMEM; @@ -3203,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (data == NULL) goto out; + data->arg.label = label; status = nfs4_do_create(dir, dentry, data); nfs4_free_createdata(data); @@ -3214,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + label = nfs4_label_init_security(dir, dentry, sattr, &l); + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mkdir(dir, dentry, sattr), + _nfs4_proc_mkdir(dir, dentry, sattr, label), &exception); } while (exception.retry); + nfs4_label_release_security(label); + return err; } @@ -3279,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, } static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, dev_t rdev) + struct iattr *sattr, struct nfs4_label *label, dev_t rdev) { struct nfs4_createdata *data; int mode = sattr->ia_mode; @@ -3304,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, status = -EINVAL; goto out_free; } - + + data->arg.label = label; status = nfs4_do_create(dir, dentry, data); out_free: nfs4_free_createdata(data); @@ -3316,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + label = nfs4_label_init_security(dir, dentry, sattr, &l); + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mknod(dir, dentry, sattr, rdev), + _nfs4_proc_mknod(dir, dentry, sattr, label, rdev), &exception); } while (exception.retry); + + nfs4_label_release_security(label); + return err; } @@ -3381,12 +3738,21 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { struct nfs4_exception exception = { }; + unsigned long now = jiffies; int err; do { - err = nfs4_handle_exception(server, - _nfs4_do_fsinfo(server, fhandle, fsinfo), - &exception); + err = _nfs4_do_fsinfo(server, fhandle, fsinfo); + if (err == 0) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo->lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + break; + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -3446,6 +3812,46 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; + return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); + +static bool nfs4_stateid_is_current(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + nfs4_stateid current_stateid; + + if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode)) + return false; + return nfs4_stateid_match(stateid, ¤t_stateid); +} + +static bool nfs4_error_stateid_expired(int err) +{ + switch (err) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_OPENMODE: + case -NFS4ERR_EXPIRED: + return true; + } + return false; +} + void __nfs4_read_done_cb(struct nfs_read_data *data) { nfs_invalidate_atime(data->header->inode); @@ -3466,6 +3872,20 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static bool nfs4_read_stateid_changed(struct rpc_task *task, + struct nfs_readargs *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_READ)) + return false; + rpc_restart_call_prepare(task); + return true; +} + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) { @@ -3473,7 +3893,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; - + if (nfs4_read_stateid_changed(task, &data->args)) + return -EAGAIN; return data->read_done_cb ? data->read_done_cb(task, data) : nfs4_read_done_cb(task, data); } @@ -3488,10 +3909,13 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_READ); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3509,10 +3933,26 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data return 0; } +static bool nfs4_write_stateid_changed(struct rpc_task *task, + struct nfs_writeargs *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_WRITE)) + return false; + rpc_restart_call_prepare(task); + return true; +} + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; + if (nfs4_write_stateid_changed(task, &data->args)) + return -EAGAIN; return data->write_done_cb ? data->write_done_cb(task, data) : nfs4_write_done_cb(task, data); } @@ -3552,10 +3992,13 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_WRITE); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -3657,7 +4100,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, return -ENOMEM; data->client = clp; data->timestamp = jiffies; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, &nfs4_renew_ops, data); } @@ -3671,7 +4114,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) unsigned long now = jiffies; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status < 0) return status; do_renew_lease(clp, now); @@ -3964,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen return err; } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static int _nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; + struct nfs4_label label = {0, 0, buflen, buf}; + + u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs4_getattr_arg args = { + .fh = NFS_FH(inode), + .bitmask = bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = &fattr, + .label = &label, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int ret; + + nfs_fattr_init(&fattr); + + ret = rpc_call_sync(server->client, &msg, 0); + if (ret) + return ret; + if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) + return -ENOENT; + if (buflen < label.len) + return -ERANGE; + return 0; +} + +static int nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_get_security_label(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) +{ + + struct iattr sattr = {0}; + struct nfs_server *server = NFS_SERVER(inode); + const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs_setattrargs args = { + .fh = NFS_FH(inode), + .iap = &sattr, + .server = server, + .bitmask = bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + nfs4_stateid_copy(&args.stateid, &zero_stateid); + + status = rpc_call_sync(server->client, &msg, 0); + if (status) + dprintk("%s failed: %d\n", __func__, status); + + return status; +} + +static int nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_do_set_security_label(inode, ilabel, + fattr, olabel), + &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +{ + struct nfs4_label ilabel, *olabel = NULL; + struct nfs_fattr fattr; + struct rpc_cred *cred; + struct inode *inode = dentry->d_inode; + int status; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + nfs_fattr_init(&fattr); + + ilabel.pi = 0; + ilabel.lfs = 0; + ilabel.label = (char *)buf; + ilabel.len = buflen; + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + + olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(olabel)) { + status = -PTR_ERR(olabel); + goto out; + } + + status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel); + if (status == 0) + nfs_setsecurity(inode, &fattr, olabel); + + nfs4_label_free(olabel); +out: + put_rpccred(cred); + return status; +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + + static int nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) { @@ -3981,11 +4573,14 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_schedule_stateid_recovery(server, state); + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto stateid_invalid; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(server, state); + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto stateid_invalid; + } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -4017,6 +4612,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; +stateid_invalid: + task->tk_status = -EIO; + return 0; wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) @@ -4116,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* cb_client4 */ rcu_read_lock(); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, - sizeof(setclientid.sc_netid), + sizeof(setclientid.sc_netid), "%s", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_NETID)); rcu_read_unlock(); @@ -4144,27 +4742,17 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct nfs4_setclientid_res *arg, struct rpc_cred *cred) { - struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], .rpc_argp = arg, - .rpc_resp = &fsinfo, .rpc_cred = cred, }; - unsigned long now; int status; dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); - now = jiffies; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (status == 0) { - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); - } dprintk("NFS reply setclientid_confirm: %d\n", status); return status; } @@ -4309,7 +4897,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 static unsigned long nfs4_set_lock_task_retry(unsigned long timeout) { - freezable_schedule_timeout_killable(timeout); + freezable_schedule_timeout_killable_unsafe(timeout); timeout <<= 1; if (timeout > NFS4_LOCK_MAXTIMEOUT) return NFS4_LOCK_MAXTIMEOUT; @@ -4547,9 +5135,9 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * if (status != 0) goto out; /* Is this a delegated lock? */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) + goto out; seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); status = -ENOMEM; if (seqid == NULL) @@ -4628,17 +5216,23 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; } - data->arg.open_stateid = &state->stateid; + data->arg.open_stateid = &state->open_stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; } else data->arg.new_lock_owner = 0; + if (!nfs4_valid_open_stateid(state)) { + data->rpc_status = -EBADF; + task->tk_action = NULL; + goto out_release_open_seqid; + } data->timestamp = jiffies; if (nfs4_setup_sequence(data->server, &data->arg.seq_args, &data->res.seq_res, task) == 0) return; +out_release_open_seqid: nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); @@ -4831,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) list_for_each_entry(lsp, &state->lock_states, ls_locks) { if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - status = nfs41_test_stateid(server, &lsp->ls_stateid); + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_stateid(server, + &lsp->ls_stateid, + cred); if (status != NFS_OK) { /* Free the stateid unless the server * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, - &lsp->ls_stateid); + &lsp->ls_stateid, + cred); clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); ret = status; } @@ -4984,58 +5583,16 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return status; } -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; int err; err = nfs4_set_lock_state(state, fl); if (err != 0) - goto out; - do { - err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - switch (err) { - default: - printk(KERN_ERR "NFS: %s: unhandled error " - "%d.\n", __func__, err); - case 0: - case -ESTALE: - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: - nfs4_schedule_lease_recovery(server->nfs_client); - err = -EAGAIN; - goto out; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); - err = -EAGAIN; - goto out; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OPENMODE: - nfs4_schedule_stateid_recovery(server, state); - err = 0; - goto out; - case -ENOMEM: - case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - err = 0; - goto out; - } - set_bit(NFS_DELEGATED_STATE, &state->flags); - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + return err; + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } struct nfs_release_lockowner_data { @@ -5055,9 +5612,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = { .rpc_release = nfs4_release_lockowner_release, }; -int nfs4_release_lockowner(struct nfs4_lock_state *lsp) +static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct nfs_server *server = lsp->ls_state->owner->so_server; struct nfs_release_lockowner_data *data; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], @@ -5113,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, return len; } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline int nfs4_server_supports_labels(struct nfs_server *server) +{ + return server->caps & NFS_CAP_SECURITY_LABEL; +} + +static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (security_ismaclabel(key)) + return nfs4_set_security_label(dentry, buf, buflen); + + return -EOPNOTSUPP; +} + +static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (security_ismaclabel(key)) + return nfs4_get_security_label(dentry->d_inode, buf, buflen); + return -EOPNOTSUPP; +} + +static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = 0; + + if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(dentry->d_inode, NULL, 0); + if (list && len <= list_len) + security_inode_listsecurity(dentry->d_inode, list, len); + } + return len; +} + +static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = nfs4_xattr_list_nfs4_label, + .get = nfs4_xattr_get_nfs4_label, + .set = nfs4_xattr_set_nfs4_label, +}; +#endif + + /* * nfs_fhget will use either the mounted_on_fileid or the fileid */ @@ -5136,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[2] = { + u32 bitmask[3] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { @@ -5323,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) struct nfs41_exchange_id_args args = { .verifier = &verifier, .client = clp, - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, }; struct nfs41_exchange_id_res res = { 0 @@ -5580,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) */ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) { - struct nfs4_session *session = args->client->cl_session; - unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, - mxresp_sz = session->fc_target_max_resp_sz; + unsigned int max_rqst_sz, max_resp_sz; + + max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; + max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; - if (mxrqst_sz == 0) - mxrqst_sz = NFS_MAX_FILE_IO_SIZE; - if (mxresp_sz == 0) - mxresp_sz = NFS_MAX_FILE_IO_SIZE; /* Fore channel attributes */ - args->fc_attrs.max_rqst_sz = mxrqst_sz; - args->fc_attrs.max_resp_sz = mxresp_sz; + args->fc_attrs.max_rqst_sz = max_rqst_sz; + args->fc_attrs.max_resp_sz = max_resp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; args->fc_attrs.max_reqs = max_session_slots; @@ -5849,7 +6450,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; if (!atomic_inc_not_zero(&clp->cl_count)) @@ -5977,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { /* * Issue a global reclaim complete. */ -static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +static int nfs41_proc_reclaim_complete(struct nfs_client *clp, + struct rpc_cred *cred) { struct nfs4_reclaim_complete_data *calldata; struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + .rpc_cred = cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, @@ -6166,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], .rpc_argp = &lgp->args, .rpc_resp = &lgp->res, + .rpc_cred = lgp->cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = server->client, @@ -6269,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], .rpc_argp = &lrp->args, .rpc_resp = &lrp->res, + .rpc_cred = lrp->cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = lrp->clp->cl_rpcclient, @@ -6338,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server, EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); static int -_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +_nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, @@ -6350,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], .rpc_argp = &args, .rpc_resp = &res, + .rpc_cred = cred, }; int status; @@ -6360,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) return status; } -int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_proc_getdeviceinfo(server, pdev), + _nfs4_proc_getdeviceinfo(server, pdev, cred), &exception); } while (exception.retry); return err; @@ -6551,7 +7161,9 @@ out: return err; } -static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) +static int _nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) { int status; struct nfs41_test_stateid_args args = { @@ -6562,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], .rpc_argp = &args, .rpc_resp = &res, + .rpc_cred = cred, }; dprintk("NFS call test_stateid %p\n", stateid); @@ -6582,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) * * @server: server / transport on which to perform the operation * @stateid: state ID to test + * @cred: credential * * Returns NFS_OK if the server recognizes that "stateid" is valid. * Otherwise a negative NFS4ERR value is returned if the operation * failed or the state ID is not currently valid. */ -static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) +static int nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) { struct nfs4_exception exception = { }; int err; do { - err = _nfs41_test_stateid(server, stateid); + err = _nfs41_test_stateid(server, stateid, cred); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -6600,26 +7216,78 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) return err; } -static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) -{ - struct nfs41_free_stateid_args args = { - .stateid = stateid, - }; +struct nfs_free_stateid_data { + struct nfs_server *server; + struct nfs41_free_stateid_args args; struct nfs41_free_stateid_res res; +}; + +static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + nfs41_setup_sequence(nfs4_get_session(data->server), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + + nfs41_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } +} + +static void nfs41_free_stateid_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs41_free_stateid_ops = { + .rpc_call_prepare = nfs41_free_stateid_prepare, + .rpc_call_done = nfs41_free_stateid_done, + .rpc_release = nfs41_free_stateid_release, +}; + +static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred, + bool privileged) +{ struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], - .rpc_argp = &args, - .rpc_resp = &res, + .rpc_cred = cred, }; - int status; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs41_free_stateid_ops, + .flags = RPC_TASK_ASYNC, + }; + struct nfs_free_stateid_data *data; dprintk("NFS call free_stateid %p\n", stateid); - nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); - status = nfs4_call_sync_sequence(server->client, server, &msg, - &args.seq_args, &res.seq_res); - dprintk("NFS reply free_stateid: %d\n", status); - return status; + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + data->server = server; + nfs4_stateid_copy(&data->args.stateid, stateid); + + task_setup.callback_data = data; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + if (privileged) + nfs4_set_sequence_privileged(&data->args.seq_args); + + return rpc_run_task(&task_setup); } /** @@ -6627,21 +7295,39 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) * * @server: server / transport on which to perform the operation * @stateid: state ID to release + * @cred: credential * * Returns NFS_OK if the server freed "stateid". Otherwise a * negative NFS4ERR value is returned. */ -static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) +static int nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) { - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_free_stateid(server, stateid); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + struct rpc_task *task; + int ret; + + task = _nfs41_free_stateid(server, stateid, cred, true); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + +static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct rpc_task *task; + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs4_free_lock_state(server, lsp); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ -6726,9 +7412,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK, .call_sync = _nfs4_call_sync, .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, + .free_lock_state = nfs4_release_lockowner, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -6737,9 +7428,35 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { #if defined(CONFIG_NFS_V4_1) static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, + .call_sync = nfs4_call_sync_sequence, + .match_stateid = nfs41_match_stateid, + .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, +}; +#endif + +#if defined(CONFIG_NFS_V4_2) +static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { + .minor_version = 2, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, @@ -6751,6 +7468,9 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #if defined(CONFIG_NFS_V4_1) [1] = &nfs_v4_1_minor_ops, #endif +#if defined(CONFIG_NFS_V4_2) + [2] = &nfs_v4_2_minor_ops, +#endif }; const struct inode_operations nfs4_dir_inode_operations = { @@ -6850,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + &nfs4_xattr_nfs4_label_handler, +#endif NULL }; diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index ebda5f4a031b..36e21cb29d65 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) tbl->highest_used_slotid = new_max; else { tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(tbl->session, tbl); + nfs4_slot_tbl_drain_complete(tbl); } } dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, @@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) struct nfs4_slot *slot = pslot; struct nfs4_slot_table *tbl = slot->table; - if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) return false; slot->generation = tbl->generation; args->sa_slot = slot; @@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp) return 0; } -int nfs4_init_session(struct nfs_server *server) +int nfs4_init_session(struct nfs_client *clp) { - struct nfs_client *clp = server->nfs_client; - struct nfs4_session *session; - unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; - unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; - if (!nfs4_has_session(clp)) return 0; - if (server->rsize != 0) - target_max_resp_sz = server->rsize; - target_max_resp_sz += nfs41_maxread_overhead; - - if (server->wsize != 0) - target_max_rqst_sz = server->wsize; - target_max_rqst_sz += nfs41_maxwrite_overhead; - - session = clp->cl_session; - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - /* Initialise targets and channel attributes */ - session->fc_target_max_rqst_sz = target_max_rqst_sz; - session->fc_attrs.max_rqst_sz = target_max_rqst_sz; - session->fc_target_max_resp_sz = target_max_resp_sz; - session->fc_attrs.max_resp_sz = target_max_resp_sz; - } else { - /* Just adjust the targets */ - if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { - session->fc_target_max_rqst_sz = target_max_rqst_sz; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - } - if (target_max_resp_sz > session->fc_target_max_resp_sz) { - session->fc_target_max_resp_sz = target_max_resp_sz; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - } - } - spin_unlock(&clp->cl_lock); - - if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) - nfs4_schedule_lease_recovery(clp); - + clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state); return nfs41_check_session_ready(clp); } diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 6f3cb39386d4..3a153d82b90c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -25,6 +25,10 @@ struct nfs4_slot { }; /* Sessions */ +enum nfs4_slot_tbl_state { + NFS4_SLOT_TBL_DRAINING, +}; + #define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ @@ -43,6 +47,7 @@ struct nfs4_slot_table { unsigned long generation; /* Generation counter for target_highest_slotid */ struct completion complete; + unsigned long slot_tbl_state; }; /* @@ -61,14 +66,10 @@ struct nfs4_session { struct nfs4_channel_attrs bc_attrs; struct nfs4_slot_table bc_slot_table; struct nfs_client *clp; - /* Create session arguments */ - unsigned int fc_target_max_rqst_sz; - unsigned int fc_target_max_resp_sz; }; enum nfs4_session_state { NFS4_SESSION_INITING, - NFS4_SESSION_DRAINING, }; #if defined(CONFIG_NFS_V4_1) @@ -85,15 +86,14 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern void nfs4_destroy_session(struct nfs4_session *session); -extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_init_session(struct nfs_client *clp); extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); -extern void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); -static inline bool nfs4_session_draining(struct nfs4_session *session) +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) { - return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); } bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, @@ -119,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) #else /* defined(CONFIG_NFS_V4_1) */ -static inline int nfs4_init_session(struct nfs_server *server) +static inline int nfs4_init_session(struct nfs_client *clp) { return 0; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d41a3518509f..e22862f13564 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -154,18 +154,6 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) return cred; } -static void nfs4_clear_machine_cred(struct nfs_client *clp) -{ - struct rpc_cred *cred; - - spin_lock(&clp->cl_lock); - cred = clp->cl_machine_cred; - clp->cl_machine_cred = NULL; - spin_unlock(&clp->cl_lock); - if (cred != NULL) - put_rpccred(cred); -} - static struct rpc_cred * nfs4_get_renew_cred_server_locked(struct nfs_server *server) { @@ -240,38 +228,37 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) return status; } -/* - * Back channel returns NFS4ERR_DELAY for new requests when - * NFS4_SESSION_DRAINING is set so there is no work to be done when draining - * is ended. - */ -static void nfs4_end_drain_session(struct nfs_client *clp) +static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) { - struct nfs4_session *ses = clp->cl_session; - struct nfs4_slot_table *tbl; - - if (ses == NULL) - return; - tbl = &ses->fc_slot_table; - if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_lock(&tbl->slot_tbl_lock); nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); } } +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + + if (ses != NULL) { + nfs4_end_drain_slot_table(&ses->bc_slot_table); + nfs4_end_drain_slot_table(&ses->fc_slot_table); + } +} + /* * Signal state manager thread if session fore channel is drained */ -void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl) +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) { - if (nfs4_session_draining(session)) + if (nfs4_slot_tbl_draining(tbl)) complete(&tbl->complete); } -static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) +static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { + set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); spin_lock(&tbl->slot_tbl_lock); if (tbl->highest_used_slotid != NFS4_NO_SLOT) { INIT_COMPLETION(tbl->complete); @@ -287,13 +274,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int ret = 0; - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); /* back channel */ - ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); if (ret) return ret; /* fore channel */ - return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); + return nfs4_drain_slot_tbl(&ses->fc_slot_table); } static void nfs41_finish_session_reset(struct nfs_client *clp) @@ -699,6 +685,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) list_for_each_entry(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (atomic_inc_not_zero(&state->count)) return state; } @@ -931,6 +919,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ */ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { + struct nfs_server *server; struct nfs4_state *state; if (lsp == NULL) @@ -942,11 +931,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + server = state->owner->so_server; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - if (nfs4_release_lockowner(lsp) == 0) - return; - } - nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp); + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else + nfs4_free_lock_state(server, lsp); } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -987,13 +978,14 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) return 0; } -static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, +static int nfs4_copy_lock_stateid(nfs4_stateid *dst, + struct nfs4_state *state, const struct nfs_lockowner *lockowner) { struct nfs4_lock_state *lsp; fl_owner_t fl_owner; pid_t fl_pid; - bool ret = false; + int ret = -ENOENT; if (lockowner == NULL) @@ -1008,7 +1000,10 @@ static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); - ret = true; + ret = 0; + smp_rmb(); + if (!list_empty(&lsp->ls_seqid.list)) + ret = -EWOULDBLOCK; } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); @@ -1016,28 +1011,44 @@ out: return ret; } -static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + const nfs4_stateid *src; + int ret; int seq; do { + src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - nfs4_stateid_copy(dst, &state->stateid); + if (test_bit(NFS_OPEN_STATE, &state->flags)) + src = &state->open_stateid; + nfs4_stateid_copy(dst, src); + ret = 0; + smp_rmb(); + if (!list_empty(&state->owner->so_seqid.list)) + ret = -EWOULDBLOCK; } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, +int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, fmode_t fmode, const struct nfs_lockowner *lockowner) { + int ret = 0; if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) - return; - if (nfs4_copy_lock_stateid(dst, state, lockowner)) - return; - nfs4_copy_open_stateid(dst, state); + goto out; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); + if (ret != -ENOENT) + goto out; + ret = nfs4_copy_open_stateid(dst, state); +out: + if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) + dst->seqid = 0; + return ret; } struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) @@ -1182,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) snprintf(buf, sizeof(buf), "%s-manager", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); rcu_read_unlock(); - task = kthread_run(nfs4_run_state_manager, clp, buf); + task = kthread_run(nfs4_run_state_manager, clp, "%s", buf); if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); @@ -1286,14 +1297,17 @@ static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_s return 1; } -void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) { struct nfs_client *clp = server->nfs_client; + if (!nfs4_valid_open_stateid(state)) + return -EBADF; nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); + return 0; } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -1323,6 +1337,27 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_schedule_state_manager(clp); } +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + set_bit(NFS_CONTEXT_BAD, &ctx->flags); + } + spin_unlock(&inode->i_lock); +} + +static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) +{ + set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); + nfs4_state_mark_open_context_bad(state); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { @@ -1337,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); /* Protect inode->i_flock using the BKL */ - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file)->state != state) continue; - unlock_flocks(); + spin_unlock(&inode->i_lock); status = ops->recover_lock(state, fl); switch (status) { case 0: @@ -1370,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* kill_proc(fl->fl_pid, SIGLOST, 1); */ status = 0; } - lock_flocks(); + spin_lock(&inode->i_lock); } - unlock_flocks(); + spin_unlock(&inode->i_lock); out: up_write(&nfsi->rwsem); return status; @@ -1398,6 +1433,8 @@ restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (state->state == 0) continue; atomic_inc(&state->count); @@ -1430,11 +1467,10 @@ restart: * Open state on this file cannot be recovered * All we can do is revert to using the zero stateid. */ - memset(&state->stateid, 0, - sizeof(state->stateid)); - /* Mark the file as being 'closed' */ - state->state = 0; + nfs4_state_mark_recovery_failed(state, status); break; + case -EAGAIN: + ssleep(1); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1526,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) } static void nfs4_reclaim_complete(struct nfs_client *clp, - const struct nfs4_state_recovery_ops *ops) + const struct nfs4_state_recovery_ops *ops, + struct rpc_cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) - (void)ops->reclaim_complete(clp); + (void)ops->reclaim_complete(clp, cred); } static void nfs4_clear_reclaim_server(struct nfs_server *server) @@ -1575,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { + const struct nfs4_state_recovery_ops *ops; + struct rpc_cred *cred; + if (!nfs4_state_clear_reclaim_reboot(clp)) return; - nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + ops = clp->cl_mvops->reboot_recovery_ops; + cred = ops->get_clid_cred(clp); + nfs4_reclaim_complete(clp, ops, cred); + put_rpccred(cred); } static void nfs_delegation_clear_all(struct nfs_client *clp) @@ -1696,6 +1739,10 @@ static int nfs4_check_lease(struct nfs_client *clp) } status = ops->renew_lease(clp, cred); put_rpccred(cred); + if (status == -ETIMEDOUT) { + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + return 0; + } out: return nfs4_recovery_handle_error(clp, status); } @@ -1725,10 +1772,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); return -EPERM; case -EACCES: - if (clp->cl_machine_cred == NULL) - return -EACCES; - /* Handle case where the user hasn't set up machine creds */ - nfs4_clear_machine_cred(clp); case -NFS4ERR_DELAY: case -ETIMEDOUT: case -EAGAIN: @@ -1823,31 +1866,18 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, { const struct nfs4_state_recovery_ops *ops = clp->cl_mvops->reboot_recovery_ops; - rpc_authflavor_t *flavors, flav, save; struct rpc_clnt *clnt; struct rpc_cred *cred; - int i, len, status; + int i, status; dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); - len = NFS_MAX_SECFLAVORS; - flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL); - if (flavors == NULL) { - status = -ENOMEM; - goto out; - } - len = rpcauth_list_flavors(flavors, len); - if (len < 0) { - status = len; - goto out_free; - } clnt = clp->cl_rpcclient; - save = clnt->cl_auth->au_flavor; i = 0; mutex_lock(&nfs_clid_init_mutex); - status = -ENOENT; again: + status = -ENOENT; cred = ops->get_clid_cred(clp); if (cred == NULL) goto out_unlock; @@ -1857,12 +1887,6 @@ again: switch (status) { case 0: break; - - case -EACCES: - if (clp->cl_machine_cred == NULL) - break; - /* Handle case where the user hasn't set up machine creds */ - nfs4_clear_machine_cred(clp); case -NFS4ERR_DELAY: case -ETIMEDOUT: case -EAGAIN: @@ -1871,17 +1895,12 @@ again: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; - + case -EACCES: + if (i++) + break; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: - status = -EPERM; - if (i >= len) - break; - - flav = flavors[i++]; - if (flav == save) - flav = flavors[i++]; - clnt = rpc_clone_client_set_auth(clnt, flav); + clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); break; @@ -1903,13 +1922,15 @@ again: case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ status = -EKEYEXPIRED; + break; + default: + pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n", + __func__, status); + status = -EIO; } out_unlock: mutex_unlock(&nfs_clid_init_mutex); -out_free: - kfree(flavors); -out: dprintk("NFS: %s: status = %d\n", __func__, status); return status; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 569b166cc050..5dbe2d269210 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -9,6 +9,7 @@ #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" @@ -252,6 +253,8 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) + data->auth_flavors[0] = RPC_AUTH_UNIX; export_path = data->nfs_server.export_path; data->nfs_server.export_path = "/"; root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, @@ -329,18 +332,24 @@ static int __init init_nfs_v4(void) { int err; - err = nfs_idmap_init(); + err = nfs_dns_resolver_init(); if (err) goto out; - err = nfs4_register_sysctl(); + err = nfs_idmap_init(); if (err) goto out1; + err = nfs4_register_sysctl(); + if (err) + goto out2; + register_nfs_version(&nfs_v4); return 0; -out1: +out2: nfs_idmap_quit(); +out1: + nfs_dns_resolver_destroy(); out: return err; } @@ -350,6 +359,7 @@ static void __exit exit_nfs_v4(void) unregister_nfs_version(&nfs_v4); nfs4_unregister_sysctl(); nfs_idmap_quit(); + nfs_dns_resolver_destroy(); } MODULE_LICENSE("GPL"); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e3edda554ac7..0abfb8466e79 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int); #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ +#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) +#define encode_readdir_space 24 +#define encode_readdir_bitmask_sz 3 +#else +#define nfs4_label_maxsz 0 +#define encode_readdir_space 20 +#define encode_readdir_bitmask_sz 2 +#endif /* We support only one layout type per file system */ #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ 3 + 3 + 3 + nfs4_owner_maxsz + \ - nfs4_group_maxsz + decode_mdsthreshold_maxsz)) + nfs4_group_maxsz + nfs4_label_maxsz + \ + decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ nfs4_fattr_value_maxsz) #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) @@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int); 1 + 2 + 1 + \ nfs4_owner_maxsz + \ nfs4_group_maxsz + \ + nfs4_label_maxsz + \ 4 + 4) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) @@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int); encode_stateid_maxsz + 3) #define decode_read_maxsz (op_decode_hdr_maxsz + 2) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ - 2 + encode_verifier_maxsz + 5) + 2 + encode_verifier_maxsz + 5 + \ + nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz) + decode_verifier_maxsz + \ + nfs4_label_maxsz + nfs4_fattr_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ @@ -530,14 +544,10 @@ static int nfs4_stat_to_errno(int); decode_setclientid_maxsz) #define NFS4_enc_setclientid_confirm_sz \ (compound_encode_hdr_maxsz + \ - encode_setclientid_confirm_maxsz + \ - encode_putrootfh_maxsz + \ - encode_fsinfo_maxsz) + encode_setclientid_confirm_maxsz) #define NFS4_dec_setclientid_confirm_sz \ (compound_decode_hdr_maxsz + \ - decode_setclientid_confirm_maxsz + \ - decode_putrootfh_maxsz + \ - decode_fsinfo_maxsz) + decode_setclientid_confirm_maxsz) #define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -857,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + decode_sequence_maxsz + decode_putfh_maxsz) * XDR_UNIT); + +const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz) * + XDR_UNIT); +EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead); #endif /* CONFIG_NFS_V4_1 */ static const umode_t nfs_type2fmt[] = { @@ -972,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } -static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, + const struct nfs4_label *label, + const struct nfs_server *server) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -983,15 +1001,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const int len; uint32_t bmval0 = 0; uint32_t bmval1 = 0; + uint32_t bmval2 = 0; /* * We reserve enough space to write the entire attribute buffer at once. * In the worst-case, this would be - * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) - * = 36 bytes, plus any contribution from variable-length fields + * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 40 bytes, plus any contribution from variable-length fields * such as owner/group. */ - len = 16; + len = 20; /* Sigh */ if (iap->ia_valid & ATTR_SIZE) @@ -1021,6 +1040,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const } len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } + if (label) + len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); if (iap->ia_valid & ATTR_ATIME_SET) len += 16; else if (iap->ia_valid & ATTR_ATIME) @@ -1035,9 +1056,9 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(3); q = p; - p += 3; + p += 4; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; @@ -1058,8 +1079,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_atime.tv_sec); + p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { @@ -1069,14 +1089,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } + if (label) { + bmval2 |= FATTR4_WORD2_SECURITY_LABEL; + *p++ = cpu_to_be32(label->lfs); + *p++ = cpu_to_be32(label->pi); + *p++ = cpu_to_be32(label->len); + p = xdr_encode_opaque_fixed(p, label->label, label->len); + } /* * Now we backfill the bitmap and the attribute buffer length. @@ -1086,9 +1112,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len, ((char *)p - (char *)q) + 4); BUG(); } - len = (char *)p - (char *)q - 12; + len = (char *)p - (char *)q - 16; *q++ = htonl(bmval0); *q++ = htonl(bmval1); + *q++ = htonl(bmval2); *q = htonl(len); /* out: */ @@ -1142,7 +1169,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->server); + encode_attrs(xdr, create->attrs, create->label, create->server); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1194,8 +1221,10 @@ encode_getattr_three(struct xdr_stream *xdr, static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], hdr); + encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], + bitmask[2] & nfs4_fattr_bitmap[2], + hdr); } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, @@ -1366,33 +1395,28 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { + struct iattr dummy; __be32 *p; - struct nfs_client *clp; p = reserve_space(xdr, 4); - switch(arg->open_flags & O_EXCL) { - case 0: + switch(arg->createmode) { + case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); break; - default: - clp = arg->server->nfs_client; - if (clp->cl_mvops->minor_version > 0) { - if (nfs4_has_persistent_session(clp)) { - *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->server); - } else { - struct iattr dummy; - - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); - encode_nfs4_verifier(xdr, &arg->u.verifier); - dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->server); - } - } else { - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); - } + case NFS4_CREATE_GUARDED: + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + break; + case NFS4_CREATE_EXCLUSIVE: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->label, arg->server); } } @@ -1459,6 +1483,23 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc encode_string(xdr, name->len, name->name); } +static inline void encode_claim_fh(struct xdr_stream *xdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH); +} + +static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH); + encode_nfs4_stateid(xdr, stateid); +} + static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr); @@ -1474,6 +1515,12 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, case NFS4_OPEN_CLAIM_DELEGATE_CUR: encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); break; + case NFS4_OPEN_CLAIM_FH: + encode_claim_fh(xdr); + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + encode_claim_delegate_cur_fh(xdr, &arg->u.delegation); + break; default: BUG(); } @@ -1506,35 +1553,12 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_open_stateid(struct xdr_stream *xdr, - const struct nfs_open_context *ctx, - const struct nfs_lock_context *l_ctx, - fmode_t fmode, - int zero_seqid) -{ - nfs4_stateid stateid; - - if (ctx->state != NULL) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - nfs4_select_rw_stateid(&stateid, ctx->state, - fmode, lockowner); - if (zero_seqid) - stateid.seqid = 0; - encode_nfs4_stateid(xdr, &stateid); - } else - encode_nfs4_stateid(xdr, &zero_stateid); -} - static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_READ, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1543,7 +1567,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { - uint32_t attrs[2] = { + uint32_t attrs[3] = { FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; @@ -1566,20 +1590,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); encode_uint64(xdr, readdir->cookie); encode_nfs4_verifier(xdr, &readdir->verifier); - p = reserve_space(xdr, 20); + p = reserve_space(xdr, encode_readdir_space); *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); - *p++ = cpu_to_be32(2); - + *p++ = cpu_to_be32(encode_readdir_bitmask_sz); *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); - *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + if (encode_readdir_bitmask_sz > 2) { + if (hdr->minorversion > 1) + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; + p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]); + } memcpy(verf, readdir->verifier.data, sizeof(verf)); - dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", + + dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", __func__, (unsigned long long)readdir->cookie, verf[0], verf[1], attrs[0] & readdir->bitmask[0], - attrs[1] & readdir->bitmask[1]); + attrs[1] & readdir->bitmask[1], + attrs[2] & readdir->bitmask[2]); } static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) @@ -1638,7 +1668,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, server); + encode_attrs(xdr, arg->iap, arg->label, server); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -1670,8 +1700,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg __be32 *p; encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_WRITE, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -1901,7 +1930,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(args->pdev->layout_type); - *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ *p++ = cpu_to_be32(0); /* bitmap length 0 */ } @@ -2015,7 +2044,7 @@ static void encode_free_stateid(struct xdr_stream *xdr, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); } #endif /* CONFIG_NFS_V4_1 */ @@ -2609,12 +2638,9 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, struct compound_hdr hdr = { .nops = 0, }; - const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_setclientid_confirm(xdr, arg, &hdr); - encode_putrootfh(xdr, &hdr); - encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); } @@ -3497,8 +3523,11 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) if (n == 0) goto root_path; dprintk("pathname4: "); - path->ncomponents = 0; - while (path->ncomponents < n) { + if (n > NFS4_PATHNAME_MAXCOMPONENTS) { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) { struct nfs4_string *component = &path->components[path->ncomponents]; status = decode_opaque_inline(xdr, &component->len, &component->data); if (unlikely(status != 0)) @@ -3507,12 +3536,6 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) pr_cont("%s%.*s ", (path->ncomponents != n ? "/ " : ""), component->len, component->data); - if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) - path->ncomponents++; - else { - dprintk("cannot parse %d components in path\n", n); - goto out_eio; - } } out: return status; @@ -3557,27 +3580,23 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st n = be32_to_cpup(p); if (n <= 0) goto out_eio; - res->nlocations = 0; - while (res->nlocations < n) { + for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; - struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + struct nfs4_fs_location *loc; + if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) + break; + loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; m = be32_to_cpup(p); - loc->nservers = 0; dprintk("%s: servers:\n", __func__); - while (loc->nservers < m) { - struct nfs4_string *server = &loc->servers[loc->nservers]; - status = decode_opaque_inline(xdr, &server->len, &server->data); - if (unlikely(status != 0)) - goto out_eio; - dprintk("%s ", server->data); - if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) - loc->nservers++; - else { + for (loc->nservers = 0; loc->nservers < m; loc->nservers++) { + struct nfs4_string *server; + + if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) { unsigned int i; dprintk("%s: using first %u of %u servers " "returned for location %u\n", @@ -3591,13 +3610,17 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(status != 0)) goto out_eio; } + break; } + server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); } status = decode_pathname(xdr, &loc->rootpath); if (unlikely(status != 0)) goto out_eio; - if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) - res->nlocations++; } if (res->nlocations != 0) status = NFS_ATTR_FATTR_V4_LOCATIONS; @@ -4056,6 +4079,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, return status; } +static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_label *label) +{ + uint32_t pi = 0; + uint32_t lfs = 0; + __u32 len; + __be32 *p; + int status = 0; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + lfs = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pi = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (len < NFS4_MAXLABELLEN) { + if (label) { + memcpy(label->label, p, len); + label->len = len; + label->pi = pi; + label->lfs = lfs; + status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; + } + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + } else + printk(KERN_WARNING "%s: label too long (%u)!\n", + __func__, len); + } + if (label && label->label) + dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, + (char *)label->label, label->len, label->pi, label->lfs); + return status; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) { int status = 0; @@ -4398,7 +4471,7 @@ out_overflow: static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, - struct nfs4_fs_locations *fs_loc, + struct nfs4_fs_locations *fs_loc, struct nfs4_label *label, const struct nfs_server *server) { int status; @@ -4506,6 +4579,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, if (status < 0) goto xdr_error; + if (label) { + status = decode_attr_security_label(xdr, bitmap, label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + } + xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; @@ -4513,7 +4593,7 @@ xdr_error: static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, - const struct nfs_server *server) + struct nfs4_label *label, const struct nfs_server *server) { unsigned int savep; uint32_t attrlen, @@ -4532,7 +4612,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, + label, server); if (status < 0) goto xdr_error; @@ -4542,10 +4623,16 @@ xdr_error: return status; } +static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs4_label *label, const struct nfs_server *server) +{ + return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server); +} + static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, NULL, server); + return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server); } /* @@ -5209,27 +5296,30 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } -static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +static int decode_secinfo_gss(struct xdr_stream *xdr, + struct nfs4_secinfo4 *flavor) { + u32 oid_len; __be32 *p; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - flavor->gss.sec_oid4.len = be32_to_cpup(p); - if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + oid_len = be32_to_cpup(p); + if (oid_len > GSS_OID_MAX_LEN) goto out_err; - p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) goto out_overflow; - memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + memcpy(flavor->flavor_info.oid.data, p, oid_len); + flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - flavor->gss.qop4 = be32_to_cpup(p++); - flavor->gss.service = be32_to_cpup(p); + flavor->flavor_info.qop = be32_to_cpup(p++); + flavor->flavor_info.service = be32_to_cpup(p); return 0; @@ -5242,10 +5332,10 @@ out_err: static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) { - struct nfs4_secinfo_flavor *sec_flavor; + struct nfs4_secinfo4 *sec_flavor; + unsigned int i, num_flavors; int status; __be32 *p; - int i, num_flavors; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -5934,7 +6024,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -5960,7 +6050,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, goto out; status = decode_getfh(xdr, res->fh); if (status == 0) - status = decode_getfattr(xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, + res->label, res->server); out: return status; } @@ -6051,7 +6142,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6080,7 +6171,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6112,7 +6203,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6245,7 +6336,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, goto out; if (res->access_request) decode_access(xdr, &res->access_supported, &res->access_result); - decode_getfattr(xdr, res->f_attr, res->server); + decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server); out: return status; } @@ -6322,7 +6413,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6648,8 +6739,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, * Decode SETCLIENTID_CONFIRM response */ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs_fsinfo *fsinfo) + struct xdr_stream *xdr) { struct compound_hdr hdr; int status; @@ -6657,10 +6747,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, status = decode_compound_hdr(xdr, &hdr); if (!status) status = decode_setclientid_confirm(xdr); - if (!status) - status = decode_putrootfh(xdr); - if (!status) - status = decode_fsinfo(xdr, fsinfo); return status; } @@ -6716,7 +6802,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, NULL, res->fs_locations, - res->fs_locations->server); + NULL, res->fs_locations->server); out: return status; } @@ -7129,7 +7215,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - NULL, entry->server) < 0) + NULL, entry->label, entry->server) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 88f9611a945c..5457745dd4f1 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -234,7 +234,7 @@ static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, lseg = kzalloc(lseg_size, gfp_flags); if (unlikely(!lseg)) { - dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, numdevs, lseg_size); return -ENOMEM; } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index a9ebd817278b..e4f9cbfec67b 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, pd.pgbase = 0; pd.pglen = PAGE_SIZE; pd.mincount = 0; + pd.maxcount = PAGE_SIZE; - err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, + pnfslay->plh_lc_cred); dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); if (err) goto err_out; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 880ba086be94..87aa1dec6120 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -114,7 +114,7 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -/* objio_free_result will free these @oir structs recieved from +/* objio_free_result will free these @oir structs received from * objlayout_{read,write}_done */ extern void objio_free_result(struct objlayout_io_res *oir); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e56e846e9d2d..29cfb7ade121 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -84,6 +84,55 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } +static void +nfs_iocounter_inc(struct nfs_io_counter *c) +{ + atomic_inc(&c->io_count); +} + +static void +nfs_iocounter_dec(struct nfs_io_counter *c) +{ + if (atomic_dec_and_test(&c->io_count)) { + clear_bit(NFS_IO_INPROGRESS, &c->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&c->flags, NFS_IO_INPROGRESS); + } +} + +static int +__nfs_iocounter_wait(struct nfs_io_counter *c) +{ + wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); + DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); + int ret = 0; + + do { + prepare_to_wait(wq, &q.wait, TASK_KILLABLE); + set_bit(NFS_IO_INPROGRESS, &c->flags); + if (atomic_read(&c->io_count) == 0) + break; + ret = nfs_wait_bit_killable(&c->flags); + } while (atomic_read(&c->io_count) != 0); + finish_wait(wq, &q.wait); + return ret; +} + +/** + * nfs_iocounter_wait - wait for i/o to complete + * @c: nfs_io_counter to use + * + * returns -ERESTARTSYS if interrupted by a fatal signal. + * Otherwise returns 0 once the io_count hits 0. + */ +int +nfs_iocounter_wait(struct nfs_io_counter *c) +{ + if (atomic_read(&c->io_count) == 0) + return 0; + return __nfs_iocounter_wait(c); +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use @@ -104,6 +153,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, struct nfs_page *req; struct nfs_lock_context *l_ctx; + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); /* try to allocate the request struct */ req = nfs_page_alloc(); if (req == NULL) @@ -116,6 +167,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; + nfs_iocounter_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -175,6 +227,7 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { + nfs_iocounter_dec(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4bdffe0ba025..3a3a79d6bf15 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static inline u64 +static u64 end_offset(u64 start, u64 len) { u64 end; @@ -376,9 +376,9 @@ end_offset(u64 start, u64 len) * start2 end2 * [----------------) */ -static inline int -lo_seg_contained(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +static bool +pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; u64 end1 = end_offset(start1, l1->length); @@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1, * start2 end2 * [----------------) */ -static inline int -lo_seg_intersecting(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +static bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; u64 end1 = end_offset(start1, l1->length); @@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1, } static bool -should_free_lseg(struct pnfs_layout_range *lseg_range, - struct pnfs_layout_range *recall_range) +should_free_lseg(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) { return (recall_range->iomode == IOMODE_ANY || lseg_range->iomode == recall_range->iomode) && - lo_seg_intersecting(lseg_range, recall_range); + pnfs_lseg_range_intersecting(lseg_range, recall_range); } static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, @@ -718,6 +718,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, spin_lock(&lo->plh_inode->i_lock); if (pnfs_layoutgets_blocked(lo, 1)) { status = -EAGAIN; + } else if (!nfs4_valid_open_stateid(open_state)) { + status = -EBADF; } else if (list_empty(&lo->plh_segs)) { int seq; @@ -764,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; /* Synchronously retrieve layout information from server and * store in lseg. @@ -858,6 +861,7 @@ _pnfs_return_layout(struct inode *ino) lrp->args.inode = ino; lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; + lrp->cred = lo->plh_lc_cred; status = nfs4_proc_layoutreturn(lrp); out: @@ -982,8 +986,8 @@ out: * are seen first. */ static s64 -cmp_layout(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { s64 d; @@ -1010,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) + if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -1048,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino, INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; - lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); + lo->plh_lc_cred = get_rpccred(ctx->cred); return lo; } @@ -1089,21 +1093,21 @@ out_existing: * READ READ true * READ RW true */ -static int -is_matching_lseg(struct pnfs_layout_range *ls_range, - struct pnfs_layout_range *range) +static bool +pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, + const struct pnfs_layout_range *range) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || - !lo_seg_intersecting(ls_range, range)) + !pnfs_lseg_range_intersecting(ls_range, range)) return 0; /* range1 covers only the first byte in the range */ range1 = *range; range1.length = 1; - return lo_seg_contained(ls_range, &range1); + return pnfs_lseg_range_contained(ls_range, &range1); } /* @@ -1119,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && - is_matching_lseg(&lseg->pls_range, range)) { + pnfs_lseg_range_match(&lseg->pls_range, range)) { ret = pnfs_get_lseg(lseg); break; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f5f8a470a647..a4f41810a7f4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -149,9 +149,10 @@ struct pnfs_device { struct nfs4_deviceid dev_id; unsigned int layout_type; unsigned int mincount; + unsigned int maxcount; /* gdia_maxcount */ struct page **pages; unsigned int pgbase; - unsigned int pglen; + unsigned int pglen; /* reply buffer length */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server, const struct nfs_fh *fh, struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, - struct pnfs_device *dev); + struct pnfs_device *dev, + struct rpc_cred *cred); extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index fc8de9016acf..c041c41f7a52 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], @@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, static int nfs_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), @@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); nfs_free_createdata(data); out: dprintk("NFS reply create: %d\n", status); @@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); nfs_free_createdata(data); out: dprintk("NFS reply mknod: %d\n", status); @@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, * should fill in the data with a LOOKUP call on the wire. */ if (status == 0) - status = nfs_instantiate(dentry, fh, fattr); + status = nfs_instantiate(dentry, fh, fattr, NULL); out_free: nfs_free_fattr(fattr); @@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a5e5d9899d56..70a26c651f09 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -514,6 +514,8 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } static const struct rpc_call_ops nfs_read_common_ops = { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f8a29db0f1b..71fdc0dfa0d2 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = { enum { Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, - Opt_vers_4_1, + Opt_vers_4_1, Opt_vers_4_2, Opt_vers_err }; @@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = { { Opt_vers_4, "4" }, { Opt_vers_4_0, "4.0" }, { Opt_vers_4_1, "4.1" }, + { Opt_vers_4_2, "4.2" }, { Opt_vers_err, NULL } }; @@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) seq_printf(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); show_sessions(m, nfss); show_pnfs(m, nfss); @@ -920,7 +922,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->auth_flavors[0] = RPC_AUTH_UNIX; + data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; data->auth_flavor_len = 1; data->minorversion = 0; data->need_mount = true; @@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string, mnt->version = 4; mnt->minorversion = 1; break; + case Opt_vers_4_2: + mnt->version = 4; + mnt->minorversion = 2; + break; default: return 0; } @@ -1608,49 +1614,35 @@ out_security_failure: } /* - * Match the requested auth flavors with the list returned by - * the server. Returns zero and sets the mount's authentication - * flavor on success; returns -EACCES if server does not support - * the requested flavor. + * Ensure that the specified authtype in args->auth_flavors[0] is supported by + * the server. Returns 0 if it's ok, and -EACCES if not. */ -static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, - struct nfs_mount_request *request) +static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, + rpc_authflavor_t *server_authlist, unsigned int count) { - unsigned int i, j, server_authlist_len = *(request->auth_flav_len); - - /* - * Certain releases of Linux's mountd return an empty - * flavor list. To prevent behavioral regression with - * these servers (ie. rejecting mounts that used to - * succeed), revert to pre-2.6.32 behavior (no checking) - * if the returned flavor list is empty. - */ - if (server_authlist_len == 0) - return 0; + unsigned int i; /* - * We avoid sophisticated negotiating here, as there are - * plenty of cases where we can get it wrong, providing - * either too little or too much security. + * If the sec= mount option is used, the specified flavor or AUTH_NULL + * must be in the list returned by the server. * - * RFC 2623, section 2.7 suggests we SHOULD prefer the - * flavor listed first. However, some servers list - * AUTH_NULL first. Our caller plants AUTH_SYS, the - * preferred default, in args->auth_flavors[0] if user - * didn't specify sec= mount option. + * AUTH_NULL has a special meaning when it's in the server list - it + * means that the server will ignore the rpc creds, so any flavor + * can be used. */ - for (i = 0; i < args->auth_flavor_len; i++) - for (j = 0; j < server_authlist_len; j++) - if (args->auth_flavors[i] == request->auth_flavs[j]) { - dfprintk(MOUNT, "NFS: using auth flavor %d\n", - request->auth_flavs[j]); - args->auth_flavors[0] = request->auth_flavs[j]; - return 0; - } + for (i = 0; i < count; i++) { + if (args->auth_flavors[0] == server_authlist[i] || + server_authlist[i] == RPC_AUTH_NULL) + goto out; + } - dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); - nfs_umount(request); + dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", + args->auth_flavors[0]); return -EACCES; + +out: + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + return 0; } /* @@ -1658,10 +1650,10 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, * corresponding to the provided path. */ static int nfs_request_mount(struct nfs_parsed_mount_data *args, - struct nfs_fh *root_fh) + struct nfs_fh *root_fh, + rpc_authflavor_t *server_authlist, + unsigned int *server_authlist_len) { - rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; - unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); struct nfs_mount_request request = { .sap = (struct sockaddr *) &args->mount_server.address, @@ -1669,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, .protocol = args->mount_server.protocol, .fh = root_fh, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, - .auth_flav_len = &server_authlist_len, + .auth_flav_len = server_authlist_len, .auth_flavs = server_authlist, .net = args->net, }; @@ -1713,29 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return status; } + return 0; +} + +static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + int status; + unsigned int i; + bool tried_auth_unix = false; + bool auth_null_in_list = false; + struct nfs_server *server = ERR_PTR(-EACCES); + struct nfs_parsed_mount_data *args = mount_info->parsed; + rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; + unsigned int authlist_len = ARRAY_SIZE(authlist); + + status = nfs_request_mount(args, mount_info->mntfh, authlist, + &authlist_len); + if (status) + return ERR_PTR(status); + /* - * MNTv1 (NFSv2) does not support auth flavor negotiation. + * Was a sec= authflavor specified in the options? First, verify + * whether the server supports it, and then just try to use it if so. */ - if (args->mount_server.version != NFS_MNT3_VERSION) - return 0; - return nfs_walk_authlist(args, &request); + if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { + status = nfs_verify_authflavor(args, authlist, authlist_len); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); + if (status) + return ERR_PTR(status); + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + } + + /* + * No sec= option was provided. RFC 2623, section 2.7 suggests we + * SHOULD prefer the flavor listed first. However, some servers list + * AUTH_NULL first. Avoid ever choosing AUTH_NULL. + */ + for (i = 0; i < authlist_len; ++i) { + rpc_authflavor_t flavor; + struct rpcsec_gss_info info; + + flavor = authlist[i]; + switch (flavor) { + case RPC_AUTH_UNIX: + tried_auth_unix = true; + break; + case RPC_AUTH_NULL: + auth_null_in_list = true; + continue; + default: + if (rpcauth_get_gssinfo(flavor, &info) != 0) + continue; + /* Fallthrough */ + } + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); + args->auth_flavors[0] = flavor; + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + if (!IS_ERR(server)) + return server; + } + + /* + * Nothing we tried so far worked. At this point, give up if we've + * already tried AUTH_UNIX or if the server's list doesn't contain + * AUTH_NULL + */ + if (tried_auth_unix || !auth_null_in_list) + return server; + + /* Last chance! Try AUTH_UNIX */ + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); + args->auth_flavors[0] = RPC_AUTH_UNIX; + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); } struct dentry *nfs_try_mount(int flags, const char *dev_name, struct nfs_mount_info *mount_info, struct nfs_subversion *nfs_mod) { - int status; struct nfs_server *server; - if (mount_info->parsed->need_mount) { - status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); - if (status) - return ERR_PTR(status); - } + if (mount_info->parsed->need_mount) + server = nfs_try_mount_request(mount_info, nfs_mod); + else + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); - /* Get a volume representation */ - server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); if (IS_ERR(server)) return ERR_CAST(server); @@ -1904,6 +1959,7 @@ static int nfs23_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->flags & NFS_MOUNT_SECFLAVOUR) args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) @@ -2373,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server) int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { - return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); + int error; + unsigned long kflags = 0, kflags_out = 0; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts, + kflags, &kflags_out); + if (error) + goto err; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; +err: + return error; } EXPORT_SYMBOL_GPL(nfs_set_sb_security); @@ -2381,10 +2451,9 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - security_sb_clone_mnt_opts(mount_info->cloned->sb, s); if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return 0; + return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); @@ -2600,6 +2669,7 @@ static int nfs4_validate_mount_data(void *options, goto out_no_address; args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->auth_flavourlen) { if (data->auth_flavourlen > 1) goto out_inval_auth; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 1f1f38f0c5d5..60395ad3a2e4 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - dentry->d_count); + d_count(dentry)); nfs_inc_stats(dir, NFSIOS_SILLYRENAME); /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c483cc50b82e..a2c7c28049d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1251,6 +1251,8 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } void nfs_commit_prepare(struct rpc_task *task, void *calldata) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 430b6872806f..dc8f1ef665ce 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -81,6 +81,22 @@ config NFSD_V4 If unsure, say N. +config NFSD_V4_SECURITY_LABEL + bool "Provide Security Label support for NFSv4 server" + depends on NFSD_V4 && SECURITY + help + + Say Y here if you want enable fine-grained security label attribute + support for NFS version 4. Security labels allow security modules like + SELinux and Smack to label files to facilitate enforcement of their policies. + Without this an NFSv4 mount will have the same label on each file. + + If you do not wish to enable fine-grained security labels SELinux or + Smack policies on NFSv4 files, say N. + + WARNING: there is still a chance of backwards-incompatible protocol changes. + For now we recommend "Y" only for developers and testers." + config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" depends on NFSD_V4 && DEBUG_KERNEL diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 87fd1410b737..d5c5b3e00266 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -82,6 +82,7 @@ int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_reply_cache_stats_open(struct inode *, struct file *); #ifdef CONFIG_NFSD_V4 void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1051bebff1b0..849a7c3ced22 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -80,6 +80,7 @@ struct nfsd_net { */ struct list_head client_lru; struct list_head close_lru; + struct list_head del_recall_lru; struct delayed_work laundromat_work; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 99bc85ff0217..7f05cd140de3 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -37,6 +37,7 @@ #include "nfsd.h" #include "state.h" #include "netns.h" +#include "xdr4cb.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -53,30 +54,6 @@ enum { NFSPROC4_CLNT_CB_SEQUENCE, }; -#define NFS4_MAXTAGLEN 20 - -#define NFS4_enc_cb_null_sz 0 -#define NFS4_dec_cb_null_sz 0 -#define cb_compound_enc_hdr_sz 4 -#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) -#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) -#define cb_sequence_enc_sz (sessionid_sz + 4 + \ - 1 /* no referring calls list yet */) -#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) - -#define op_enc_sz 1 -#define op_dec_sz 2 -#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) -#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) -#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ - cb_sequence_enc_sz + \ - 1 + enc_stateid_sz + \ - enc_nfs4_fh_sz) - -#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ - cb_sequence_dec_sz + \ - op_dec_sz) - struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ @@ -817,8 +794,7 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfs4_client *clp = cb->cb_clp; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -839,8 +815,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfs4_client *clp = cb->cb_clp; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -863,7 +838,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ae73175e6e68..a7cee864e7b2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -42,6 +42,36 @@ #include "current_stateid.h" #include "netns.h" +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#include <linux/security.h> + +static inline void +nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) +{ + struct inode *inode = resfh->fh_dentry->d_inode; + int status; + + mutex_lock(&inode->i_mutex); + status = security_inode_setsecctx(resfh->fh_dentry, + label->data, label->len); + mutex_unlock(&inode->i_mutex); + + if (status) + /* + * XXX: We should really fail the whole open, but we may + * already have created a new file, so it may be too + * late. For now this seems the least of evils: + */ + bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + return; +} +#else +static inline void +nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) +{ } +#endif + #define NFSDDBG_FACILITY NFSDDBG_PROC static u32 nfsd_attrmask[] = { @@ -191,9 +221,18 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) return nfserr_symlink; } +static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh) +{ + if (nfsd4_has_session(cstate)) + return; + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, + &resfh->fh_handle); +} + static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { + struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *resfh; int accmode; __be32 status; @@ -230,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); + if (!status && open->op_label.len) + nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval); + /* * Following rfc 3530 14.2.16, use the returned bitmask * to indicate which attributes we used to store the @@ -252,11 +294,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o if (is_create_with_attrs(open) && open->op_acl != NULL) do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); - /* set reply cache */ - fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, - &resfh->fh_handle); + nfsd4_set_open_owner_reply_cache(cstate, open, resfh); accmode = NFSD_MAY_NOP; - if (open->op_created) + if (open->op_created || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) accmode |= NFSD_MAY_OWNER_OVERRIDE; status = do_open_permission(rqstp, resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); @@ -268,9 +309,11 @@ out: } static __be32 -do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { + struct svc_fh *current_fh = &cstate->current_fh; __be32 status; + int accmode = 0; /* We don't know the target directory, and therefore can not * set the change info @@ -278,15 +321,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); - /* set replay cache */ - fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, - ¤t_fh->fh_handle); + nfsd4_set_open_owner_reply_cache(cstate, open, current_fh); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); + /* + * In the delegation case, the client is telling us about an + * open that it *already* performed locally, some time ago. We + * should let it succeed now if possible. + * + * In the case of a CLAIM_FH open, on the other hand, the client + * may be counting on us to enforce permissions (the Linux 4.1 + * client uses this for normal opens, for example). + */ + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH) + accmode = NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, current_fh, open, - NFSD_MAY_OWNER_OVERRIDE); + status = do_open_permission(rqstp, current_fh, open, accmode); return status; } @@ -351,6 +402,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } if (status) goto out; + if (open->op_xdr_error) { + status = open->op_xdr_error; + goto out; + } status = nfsd4_check_open_attributes(rqstp, cstate, open); if (status) @@ -368,8 +423,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, &cstate->current_fh, - open); + status = do_open_lookup(rqstp, cstate, open); if (status) goto out; break; @@ -382,8 +436,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - status = do_open_fhandle(rqstp, &cstate->current_fh, - open); + status = do_open_fhandle(rqstp, cstate, open); if (status) goto out; break; @@ -409,14 +462,33 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, WARN_ON(status && open->op_created); out: nfsd4_cleanup_open_state(open, status); - if (open->op_openowner) + if (open->op_openowner && !nfsd4_has_session(cstate)) cstate->replay_owner = &open->op_openowner->oo_owner; - else + nfsd4_bump_seqid(cstate, status); + if (!cstate->replay_owner) nfs4_unlock_state(); return status; } /* + * OPEN is the only seqid-mutating operation whose decoding can fail + * with a seqid-mutating error (specifically, decoding of user names in + * the attributes). Therefore we have to do some processing to look up + * the stateowner so that we can bump the seqid. + */ +static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op) +{ + struct nfsd4_open *open = (struct nfsd4_open *)&op->u; + + if (!seqid_mutating_err(ntohl(op->status))) + return op->status; + if (nfsd4_has_session(cstate)) + return op->status; + open->op_xdr_error = op->status; + return nfsd4_open(rqstp, cstate, open); +} + +/* * filehandle-manipulating ops. */ static __be32 @@ -599,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; + if (create->cr_label.len) + nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval); + if (create->cr_acl != NULL) do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, create->cr_bmval); @@ -786,21 +861,11 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, rename->rn_snamelen, &cstate->current_fh, rename->rn_tname, rename->rn_tnamelen); - - /* the underlying filesystem returns different error's than required - * by NFSv4. both save_fh and current_fh have been verified.. */ - if (status == nfserr_isdir) - status = nfserr_exist; - else if ((status == nfserr_notdir) && - (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && - S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) - status = nfserr_exist; - - if (!status) { - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); - } - return status; + if (status) + return status; + set_change_info(&rename->rn_sinfo, &cstate->current_fh); + set_change_info(&rename->rn_tinfo, &cstate->save_fh); + return nfs_ok; } static __be32 @@ -888,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, setattr->sa_acl); if (status) goto out; + if (setattr->sa_label.len) + status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh, + &setattr->sa_label); + if (status) + goto out; status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 0, (time_t)0); out: @@ -931,14 +1001,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, stateid, WR_STATE, &filp); - if (filp) - get_file(filp); - nfs4_unlock_state(); - if (status) { + nfs4_unlock_state(); dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; } + if (filp) + get_file(filp); + nfs4_unlock_state(); cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; @@ -1244,8 +1314,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * for example, if there is a miscellaneous XDR error * it will be set to nfserr_bad_xdr. */ - if (op->status) + if (op->status) { + if (op->opnum == OP_OPEN) + op->status = nfsd4_open_omfg(rqstp, cstate, op); goto encode_op; + } /* We must be able to encode a successful response to * this operation, with enough room left over to encode a @@ -1282,12 +1355,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (op->status) goto encode_op; - if (opdesc->op_func) { - if (opdesc->op_get_currentstateid) - opdesc->op_get_currentstateid(cstate, &op->u); - op->status = opdesc->op_func(rqstp, cstate, &op->u); - } else - BUG_ON(op->status == nfs_ok); + if (opdesc->op_get_currentstateid) + opdesc->op_get_currentstateid(cstate, &op->u); + op->status = opdesc->op_func(rqstp, cstate, &op->u); if (!op->status) { if (opdesc->op_set_currentstateid) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 899ca26dd194..105a3b080d12 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -146,7 +146,7 @@ out_no_tfm: * then disable recovery tracking. */ static void -legacy_recdir_name_error(int error) +legacy_recdir_name_error(struct nfs4_client *clp, int error) { printk(KERN_ERR "NFSD: unable to generate recoverydir " "name (%d).\n", error); @@ -159,9 +159,7 @@ legacy_recdir_name_error(int error) if (error == -ENOENT) { printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " "Reboot recovery will not function correctly!\n"); - - /* the argument is ignored by the legacy exit function */ - nfsd4_client_tracking_exit(NULL); + nfsd4_client_tracking_exit(clp->net); } } @@ -184,7 +182,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) - return legacy_recdir_name_error(status); + return legacy_recdir_name_error(clp, status); status = nfs4_save_creds(&original_cred); if (status < 0) @@ -242,11 +240,16 @@ struct name_list { struct list_head list; }; +struct nfs4_dir_ctx { + struct dir_context ctx; + struct list_head names; +}; + static int nfsd4_build_namelist(void *arg, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct list_head *names = arg; + struct nfs4_dir_ctx *ctx = arg; struct name_list *entry; if (namlen != HEXDIR_LEN - 1) @@ -256,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen, return -ENOMEM; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; - list_add(&entry->list, names); + list_add(&entry->list, &ctx->names); return 0; } @@ -265,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) { const struct cred *original_cred; struct dentry *dir = nn->rec_file->f_path.dentry; - LIST_HEAD(names); + struct nfs4_dir_ctx ctx = { + .ctx.actor = nfsd4_build_namelist, + .names = LIST_HEAD_INIT(ctx.names) + }; int status; status = nfs4_save_creds(&original_cred); @@ -278,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) return status; } - status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); + status = iterate_dir(nn->rec_file, &ctx.ctx); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - while (!list_empty(&names)) { + while (!list_empty(&ctx.names)) { struct name_list *entry; - entry = list_entry(names.next, struct name_list, list); + entry = list_entry(ctx.names.next, struct name_list, list); if (!status) { struct dentry *dentry; dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); @@ -341,7 +347,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) - return legacy_recdir_name_error(status); + return legacy_recdir_name_error(clp, status); status = mnt_want_write_file(nn->rec_file); if (status) @@ -601,7 +607,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) { - legacy_recdir_name_error(status); + legacy_recdir_name_error(clp, status); return status; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2e27430b9070..280acef6f0dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -42,6 +42,7 @@ #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include "xdr4.h" +#include "xdr4cb.h" #include "vfs.h" #include "current_stateid.h" @@ -94,17 +95,33 @@ nfs4_lock_state(void) mutex_lock(&client_mutex); } -static void free_session(struct kref *); +static void free_session(struct nfsd4_session *); -/* Must be called under the client_lock */ -static void nfsd4_put_session_locked(struct nfsd4_session *ses) +static bool is_session_dead(struct nfsd4_session *ses) { - kref_put(&ses->se_ref, free_session); + return ses->se_flags & NFS4_SESSION_DEAD; } -static void nfsd4_get_session(struct nfsd4_session *ses) +void nfsd4_put_session(struct nfsd4_session *ses) +{ + if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) + free_session(ses); +} + +static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) +{ + if (atomic_read(&ses->se_ref) > ref_held_by_me) + return nfserr_jukebox; + ses->se_flags |= NFS4_SESSION_DEAD; + return nfs_ok; +} + +static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) { - kref_get(&ses->se_ref); + if (is_session_dead(ses)) + return nfserr_badsession; + atomic_inc(&ses->se_ref); + return nfs_ok; } void @@ -113,6 +130,90 @@ nfs4_unlock_state(void) mutex_unlock(&client_mutex); } +static bool is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + +static __be32 mark_client_expired_locked(struct nfs4_client *clp) +{ + if (atomic_read(&clp->cl_refcount)) + return nfserr_jukebox; + clp->cl_time = 0; + return nfs_ok; +} + +static __be32 mark_client_expired(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + __be32 ret; + + spin_lock(&nn->client_lock); + ret = mark_client_expired_locked(clp); + spin_unlock(&nn->client_lock); + return ret; +} + +static __be32 get_client_locked(struct nfs4_client *clp) +{ + if (is_client_expired(clp)) + return nfserr_expired; + atomic_inc(&clp->cl_refcount); + return nfs_ok; +} + +/* must be called under the client_lock */ +static inline void +renew_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (is_client_expired(clp)) { + WARN_ON(1); + printk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + + dprintk("renewing client (clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + list_move_tail(&clp->cl_lru, &nn->client_lru); + clp->cl_time = get_seconds(); +} + +static inline void +renew_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + renew_client_locked(clp); + spin_unlock(&nn->client_lock); +} + +static void put_client_renew_locked(struct nfs4_client *clp) +{ + if (!atomic_dec_and_test(&clp->cl_refcount)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); +} + +void put_client_renew(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); + spin_unlock(&nn->client_lock); +} + + static inline u32 opaque_hashval(const void *ptr, int nbytes) { @@ -126,8 +227,6 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static struct list_head del_recall_lru; - static void nfsd4_free_file(struct nfs4_file *f) { kmem_cache_free(file_slab, f); @@ -137,7 +236,7 @@ static inline void put_nfs4_file(struct nfs4_file *fi) { if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { - list_del(&fi->fi_hash); + hlist_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); nfsd4_free_file(fi); @@ -181,7 +280,7 @@ static unsigned int file_hashval(struct inode *ino) return hash_ptr(ino, FILE_HASH_BITS); } -static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -210,13 +309,7 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { nfs4_file_put_fd(fp, oflag); - /* - * It's also safe to get rid of the RDWR open *if* - * we no longer have need of the other kind of access - * or if we already have the other kind of open: - */ - if (fp->fi_fds[1-oflag] - || atomic_read(&fp->fi_access[1 - oflag]) == 0) + if (atomic_read(&fp->fi_access[1 - oflag]) == 0) nfs4_file_put_fd(fp, O_RDWR); } } @@ -234,7 +327,6 @@ static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { struct idr *stateids = &cl->cl_stateids; - static int min_stateid = 0; struct nfs4_stid *stid; int new_id; @@ -242,7 +334,7 @@ kmem_cache *slab) if (!stid) return NULL; - new_id = idr_alloc(stateids, stid, min_stateid, 0, GFP_KERNEL); + new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL); if (new_id < 0) goto out_free; stid->sc_client = cl; @@ -261,13 +353,9 @@ kmem_cache *slab) * amount of time until an id is reused, by ensuring they always * "increase" (mod INT_MAX): */ - - min_stateid = new_id+1; - if (min_stateid == INT_MAX) - min_stateid = 0; return stid; out_free: - kfree(stid); + kmem_cache_free(slab, stid); return NULL; } @@ -277,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; dprintk("NFSD alloc_init_deleg\n"); - /* - * Major work on the lease subsystem (for example, to support - * calbacks on stat) will be required before we can support - * write delegations properly. - */ - if (type != NFS4_OPEN_DELEGATE_READ) - return NULL; if (fp->fi_had_conflict) return NULL; if (num_delegations > max_delegations) @@ -310,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv INIT_LIST_HEAD(&dp->dl_recall_lru); get_nfs4_file(fp); dp->dl_file = fp; - dp->dl_type = type; + dp->dl_type = NFS4_OPEN_DELEGATE_READ; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -318,21 +399,18 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv return dp; } -static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab) +static void remove_stid(struct nfs4_stid *s) { struct idr *stateids = &s->sc_client->cl_stateids; idr_remove(stateids, s->sc_stateid.si_opaque.so_id); - kmem_cache_free(slab, s); } void nfs4_put_delegation(struct nfs4_delegation *dp) { if (atomic_dec_and_test(&dp->dl_count)) { - dprintk("NFSD: freeing dp %p\n",dp); - put_nfs4_file(dp->dl_file); - free_stid(&dp->dl_stid, deleg_slab); + kmem_cache_free(deleg_slab, dp); num_delegations--; } } @@ -356,16 +434,45 @@ static void unhash_stid(struct nfs4_stid *s) static void unhash_delegation(struct nfs4_delegation *dp) { - unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); spin_unlock(&recall_lock); nfs4_put_deleg_lease(dp->dl_file); + put_nfs4_file(dp->dl_file); + dp->dl_file = NULL; +} + + + +static void destroy_revoked_delegation(struct nfs4_delegation *dp) +{ + list_del_init(&dp->dl_recall_lru); + remove_stid(&dp->dl_stid); + nfs4_put_delegation(dp); +} + +static void destroy_delegation(struct nfs4_delegation *dp) +{ + unhash_delegation(dp); + remove_stid(&dp->dl_stid); nfs4_put_delegation(dp); } +static void revoke_delegation(struct nfs4_delegation *dp) +{ + struct nfs4_client *clp = dp->dl_stid.sc_client; + + if (clp->cl_minorversion == 0) + destroy_delegation(dp); + else { + unhash_delegation(dp); + dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + } +} + /* * SETCLIENTID state */ @@ -506,7 +613,8 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { - free_stid(&stp->st_stid, stateid_slab); + remove_stid(&stp->st_stid); + kmem_cache_free(stateid_slab, stp); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -622,6 +730,28 @@ dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) } #endif +/* + * Bump the seqid on cstate->replay_owner, and clear replay_owner if it + * won't be used for replay. + */ +void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (nfserr == nfserr_replay_me) + return; + + if (!seqid_mutating_err(ntohl(nfserr))) { + cstate->replay_owner = NULL; + return; + } + if (!so) + return; + if (so->so_is_open_owner) + release_last_closed_stateid(openowner(so)); + so->so_seqid++; + return; +} static void gen_sessionid(struct nfsd4_session *ses) @@ -662,17 +792,15 @@ free_session_slots(struct nfsd4_session *ses) * We don't actually need to cache the rpc and session headers, so we * can allocate a little less for each slot: */ -static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) { - return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; -} + u32 size; -static int nfsd4_sanitize_slot_size(u32 size) -{ - size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */ - size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE); - - return size; + if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) + size = 0; + else + size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + return size + sizeof(struct nfsd4_slot); } /* @@ -680,12 +808,12 @@ static int nfsd4_sanitize_slot_size(u32 size) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. */ -static int nfsd4_get_drc_mem(int slotsize, u32 num) +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { + u32 slotsize = slot_bytes(ca); + u32 num = ca->maxreqs; int avail; - num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); - spin_lock(&nfsd_drc_lock); avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, nfsd_drc_max_mem - nfsd_drc_mem_used); @@ -696,15 +824,19 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num) return num; } -static void nfsd4_put_drc_mem(int slotsize, int num) +static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) { + int slotsize = slot_bytes(ca); + spin_lock(&nfsd_drc_lock); - nfsd_drc_mem_used -= slotsize * num; + nfsd_drc_mem_used -= slotsize * ca->maxreqs; spin_unlock(&nfsd_drc_lock); } -static struct nfsd4_session *__alloc_session(int slotsize, int numslots) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) { + int numslots = attrs->maxreqs; + int slotsize = slot_bytes(attrs); struct nfsd4_session *new; int mem, i; @@ -717,8 +849,7 @@ static struct nfsd4_session *__alloc_session(int slotsize, int numslots) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ for (i = 0; i < numslots; i++) { - mem = sizeof(struct nfsd4_slot) + slotsize; - new->se_slots[i] = kzalloc(mem, GFP_KERNEL); + new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); if (!new->se_slots[i]) goto out_free; } @@ -730,21 +861,6 @@ out_free: return NULL; } -static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, - struct nfsd4_channel_attrs *req, - int numslots, int slotsize, - struct nfsd_net *nn) -{ - u32 maxrpc = nn->nfsd_serv->sv_max_mesg; - - new->maxreqs = numslots; - new->maxresp_cached = min_t(u32, req->maxresp_cached, - slotsize + NFSD_MIN_HDR_SEQ_SZ); - new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); - new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); - new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); -} - static void free_conn(struct nfsd4_conn *c) { svc_xprt_put(c->cn_xprt); @@ -761,8 +877,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) list_del(&c->cn_persession); free_conn(c); } - spin_unlock(&clp->cl_lock); nfsd4_probe_callback(clp); + spin_unlock(&clp->cl_lock); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) @@ -846,59 +962,20 @@ static void nfsd4_del_conns(struct nfsd4_session *s) static void __free_session(struct nfsd4_session *ses) { - nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs); free_session_slots(ses); kfree(ses); } -static void free_session(struct kref *kref) +static void free_session(struct nfsd4_session *ses) { - struct nfsd4_session *ses; - struct nfsd_net *nn; - - ses = container_of(kref, struct nfsd4_session, se_ref); - nn = net_generic(ses->se_client->net, nfsd_net_id); + struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); nfsd4_del_conns(ses); + nfsd4_put_drc_mem(&ses->se_fchannel); __free_session(ses); } -void nfsd4_put_session(struct nfsd4_session *ses) -{ - struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); - - spin_lock(&nn->client_lock); - nfsd4_put_session_locked(ses); - spin_unlock(&nn->client_lock); -} - -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan, - struct nfsd_net *nn) -{ - struct nfsd4_session *new; - int numslots, slotsize; - /* - * Note decreasing slot size below client's request may - * make it difficult for client to function correctly, whereas - * decreasing the number of slots will (just?) affect - * performance. When short on memory we therefore prefer to - * decrease number of slots instead of their size. - */ - slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); - numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); - if (numslots < 1) - return NULL; - - new = __alloc_session(slotsize, numslots); - if (!new) { - nfsd4_put_drc_mem(slotsize, numslots); - return NULL; - } - init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn); - return new; -} - static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; @@ -913,7 +990,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru new->se_flags = cses->flags; new->se_cb_prog = cses->callback_prog; new->se_cb_sec = cses->cb_sec; - kref_init(&new->se_ref); + atomic_set(&new->se_ref, 0); idx = hash_sessionid(&new->se_sessionid); spin_lock(&nn->client_lock); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); @@ -921,7 +998,8 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_unlock(&nn->client_lock); - + memcpy(&new->se_fchannel, &cses->fore_channel, + sizeof(struct nfsd4_channel_attrs)); if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* @@ -968,38 +1046,6 @@ unhash_session(struct nfsd4_session *ses) spin_unlock(&ses->se_client->cl_lock); } -/* must be called under the client_lock */ -static inline void -renew_client_locked(struct nfs4_client *clp) -{ - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - if (is_client_expired(clp)) { - WARN_ON(1); - printk("%s: client (clientid %08x/%08x) already expired\n", - __func__, - clp->cl_clientid.cl_boot, - clp->cl_clientid.cl_id); - return; - } - - dprintk("renewing client (clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, - clp->cl_clientid.cl_id); - list_move_tail(&clp->cl_lru, &nn->client_lru); - clp->cl_time = get_seconds(); -} - -static inline void -renew_client(struct nfs4_client *clp) -{ - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - spin_lock(&nn->client_lock); - renew_client_locked(clp); - spin_unlock(&nn->client_lock); -} - /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) @@ -1043,7 +1089,8 @@ free_client(struct nfs4_client *clp) ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, se_perclnt); list_del(&ses->se_perclnt); - nfsd4_put_session_locked(ses); + WARN_ON_ONCE(atomic_read(&ses->se_ref)); + free_session(ses); } free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); @@ -1051,29 +1098,12 @@ free_client(struct nfs4_client *clp) kfree(clp); } -void -release_session_client(struct nfsd4_session *session) -{ - struct nfs4_client *clp = session->se_client; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) - return; - if (is_client_expired(clp)) { - free_client(clp); - session->se_client = NULL; - } else - renew_client_locked(clp); - spin_unlock(&nn->client_lock); -} - /* must be called under the client_lock */ static inline void unhash_client_locked(struct nfs4_client *clp) { struct nfsd4_session *ses; - mark_client_expired(clp); list_del(&clp->cl_lru); spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) @@ -1099,7 +1129,7 @@ destroy_client(struct nfs4_client *clp) spin_unlock(&recall_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - unhash_delegation(dp); + destroy_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); @@ -1115,8 +1145,8 @@ destroy_client(struct nfs4_client *clp) rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); spin_lock(&nn->client_lock); unhash_client_locked(clp); - if (atomic_read(&clp->cl_refcount) == 0) - free_client(clp); + WARN_ON_ONCE(atomic_read(&clp->cl_refcount)); + free_client(clp); spin_unlock(&nn->client_lock); } @@ -1152,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source) target->cr_gid = source->cr_gid; target->cr_group_info = source->cr_group_info; get_group_info(target->cr_group_info); + target->cr_gss_mech = source->cr_gss_mech; + if (source->cr_gss_mech) + gss_mech_get(source->cr_gss_mech); return 0; } @@ -1226,6 +1259,31 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2) return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); } +static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) +{ + struct svc_cred *cr = &rqstp->rq_cred; + u32 service; + + service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); + return service == RPC_GSS_SVC_INTEGRITY || + service == RPC_GSS_SVC_PRIVACY; +} + +static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) +{ + struct svc_cred *cr = &rqstp->rq_cred; + + if (!cl->cl_mach_cred) + return true; + if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech) + return false; + if (!svc_rqst_integrity_protected(rqstp)) + return false; + if (!cr->cr_principal) + return false; + return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); +} + static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) { static u32 current_clientid = 1; @@ -1295,6 +1353,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); @@ -1376,12 +1435,12 @@ move_to_confirmed(struct nfs4_client *clp) } static struct nfs4_client * -find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) { struct nfs4_client *clp; unsigned int idhashval = clientid_hashval(clid->cl_id); - list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) { + list_for_each_entry(clp, &tbl[idhashval], cl_idhash) { if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; @@ -1393,19 +1452,19 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) } static struct nfs4_client * +find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +{ + struct list_head *tbl = nn->conf_id_hashtbl; + + return find_client_in_id_table(tbl, clid, sessions); +} + +static struct nfs4_client * find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { - struct nfs4_client *clp; - unsigned int idhashval = clientid_hashval(clid->cl_id); + struct list_head *tbl = nn->unconf_id_hashtbl; - list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) { - if ((bool)clp->cl_minorversion != sessions) - return NULL; - return clp; - } - } - return NULL; + return find_client_in_id_table(tbl, clid, sessions); } static bool clp_used_exchangeid(struct nfs4_client *clp) @@ -1602,15 +1661,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; - /* Currently only support SP4_NONE */ switch (exid->spa_how) { + case SP4_MACH_CRED: + if (!svc_rqst_integrity_protected(rqstp)) + return nfserr_inval; case SP4_NONE: break; default: /* checked by xdr code */ WARN_ON_ONCE(1); case SP4_SSV: - case SP4_MACH_CRED: - return nfserr_serverfault; /* no excuse :-/ */ + return nfserr_encr_alg_unsupp; } /* Cases below refer to rfc 5661 section 18.35.4: */ @@ -1625,6 +1685,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, status = nfserr_inval; goto out; } + if (!mach_creds_match(conf, rqstp)) { + status = nfserr_wrong_cred; + goto out; + } if (!creds_match) { /* case 9 */ status = nfserr_perm; goto out; @@ -1671,7 +1735,8 @@ out_new: status = nfserr_jukebox; goto out; } - new->cl_minorversion = 1; + new->cl_minorversion = cstate->minorversion; + new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); gen_clid(new, nn); add_to_unconfirmed(new); @@ -1750,10 +1815,73 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, /* seqid, slotID, slotID, slotID, status */ \ 5 ) * sizeof(__be32)) -static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) +static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { - return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ - || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ; + u32 maxrpc = nn->nfsd_serv->sv_max_mesg; + + if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ) + return nfserr_toosmall; + if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ) + return nfserr_toosmall; + ca->headerpadsz = 0; + ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); + ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); + ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); + ca->maxresp_cached = min_t(u32, ca->maxresp_cached, + NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); + ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); + /* + * Note decreasing slot size below client's request may make it + * difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. Clients that + * request larger slots than they need will get poor results: + */ + ca->maxreqs = nfsd4_get_drc_mem(ca); + if (!ca->maxreqs) + return nfserr_jukebox; + + return nfs_ok; +} + +static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) +{ + ca->headerpadsz = 0; + + /* + * These RPC_MAX_HEADER macros are overkill, especially since we + * don't even do gss on the backchannel yet. But this is still + * less than 1k. Tighten up this estimate in the unlikely event + * it turns out to be a problem for some client: + */ + if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) + return nfserr_toosmall; + if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) + return nfserr_toosmall; + ca->maxresp_cached = 0; + if (ca->maxops < 2) + return nfserr_toosmall; + + return nfs_ok; +} + +static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs) +{ + switch (cbs->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + return nfs_ok; + default: + /* + * GSS case: the spec doesn't allow us to return this + * error. But it also doesn't allow us not to support + * GSS. + * I'd rather this fail hard than return some error the + * client might think it can already handle: + */ + return nfserr_encr_alg_unsupp; + } } __be32 @@ -1771,12 +1899,19 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; - if (check_forechannel_attrs(cr_ses->fore_channel)) - return nfserr_toosmall; - new = alloc_session(&cr_ses->fore_channel, nn); - if (!new) - return nfserr_jukebox; + status = nfsd4_check_cb_sec(&cr_ses->cb_sec); + if (status) + return status; + status = check_forechannel_attrs(&cr_ses->fore_channel, nn); + if (status) + return status; + status = check_backchannel_attrs(&cr_ses->back_channel); + if (status) + return status; status = nfserr_jukebox; + new = alloc_session(&cr_ses->fore_channel); + if (!new) + goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; @@ -1784,8 +1919,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, nfs4_lock_state(); unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); if (conf) { + status = nfserr_wrong_cred; + if (!mach_creds_match(conf, rqstp)) + goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status == nfserr_replay_cache) { @@ -1802,6 +1941,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfserr_clid_inuse; goto out_free_conn; } + status = nfserr_wrong_cred; + if (!mach_creds_match(unconf, rqstp)) + goto out_free_conn; cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { @@ -1810,8 +1952,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } old = find_confirmed_client_by_name(&unconf->cl_name, nn); - if (old) + if (old) { + status = mark_client_expired(old); + if (status) + goto out_free_conn; expire_client(old); + } move_to_confirmed(unconf); conf = unconf; } else { @@ -1830,23 +1976,21 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - memcpy(&cr_ses->fore_channel, &new->se_fchannel, - sizeof(struct nfsd4_channel_attrs)); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and embedded create sessions under the state lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); nfs4_unlock_state(); -out: - dprintk("%s returns %d\n", __func__, ntohl(status)); return status; out_free_conn: nfs4_unlock_state(); free_conn(conn); out_free_session: __free_session(new); - goto out; +out_release_drc_mem: + nfsd4_put_drc_mem(&cr_ses->fore_channel); + return status; } static __be32 nfsd4_map_bcts_dir(u32 *dir) @@ -1867,7 +2011,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state { struct nfsd4_session *session = cstate->session; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; + status = nfsd4_check_cb_sec(&bc->bc_cb_sec); + if (status) + return status; spin_lock(&nn->client_lock); session->se_cb_prog = bc->bc_cb_program; session->se_cb_sec = bc->bc_cb_sec; @@ -1884,30 +2032,33 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, { __be32 status; struct nfsd4_conn *conn; + struct nfsd4_session *session; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; + nfs4_lock_state(); spin_lock(&nn->client_lock); - cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); - /* Sorta weird: we only need the refcnt'ing because new_conn acquires - * client_lock iself: */ - if (cstate->session) { - nfsd4_get_session(cstate->session); - atomic_inc(&cstate->session->se_client->cl_refcount); - } + session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); spin_unlock(&nn->client_lock); - if (!cstate->session) - return nfserr_badsession; - + status = nfserr_badsession; + if (!session) + goto out; + status = nfserr_wrong_cred; + if (!mach_creds_match(session->se_client, rqstp)) + goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) - return status; + goto out; conn = alloc_conn(rqstp, bcts->dir); + status = nfserr_jukebox; if (!conn) - return nfserr_jukebox; - nfsd4_init_conn(rqstp, conn, cstate->session); - return nfs_ok; + goto out; + nfsd4_init_conn(rqstp, conn, session); + status = nfs_ok; +out: + nfs4_unlock_state(); + return status; } static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) @@ -1923,42 +2074,43 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_destroy_session *sessionid) { struct nfsd4_session *ses; - __be32 status = nfserr_badsession; + __be32 status; + int ref_held_by_me = 0; struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); - /* Notes: - * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid - * - Should we return nfserr_back_chan_busy if waiting for - * callbacks on to-be-destroyed session? - * - Do we need to clear any callback info from previous session? - */ - + nfs4_lock_state(); + status = nfserr_not_only_op; if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { if (!nfsd4_last_compound_op(r)) - return nfserr_not_only_op; + goto out; + ref_held_by_me++; } dump_sessionid(__func__, &sessionid->sessionid); spin_lock(&nn->client_lock); ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); - if (!ses) { - spin_unlock(&nn->client_lock); - goto out; - } - + status = nfserr_badsession; + if (!ses) + goto out_client_lock; + status = nfserr_wrong_cred; + if (!mach_creds_match(ses->se_client, r)) + goto out_client_lock; + nfsd4_get_session_locked(ses); + status = mark_session_dead_locked(ses, 1 + ref_held_by_me); + if (status) + goto out_put_session; unhash_session(ses); spin_unlock(&nn->client_lock); - nfs4_lock_state(); nfsd4_probe_callback_sync(ses->se_client); - nfs4_unlock_state(); spin_lock(&nn->client_lock); - nfsd4_del_conns(ses); - nfsd4_put_session_locked(ses); - spin_unlock(&nn->client_lock); status = nfs_ok; +out_put_session: + nfsd4_put_session(ses); +out_client_lock: + spin_unlock(&nn->client_lock); out: - dprintk("%s returns %d\n", __func__, ntohl(status)); + nfs4_unlock_state(); return status; } @@ -1974,26 +2126,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s return NULL; } -static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) +static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; + __be32 status = nfs_ok; int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); - if (c) { - spin_unlock(&clp->cl_lock); - free_conn(new); - return; - } + if (c) + goto out_free; + status = nfserr_conn_not_bound_to_session; + if (clp->cl_mach_cred) + goto out_free; __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); ret = nfsd4_register_conn(new); if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&new->cn_xpt_user); - return; + return nfs_ok; +out_free: + spin_unlock(&clp->cl_lock); + free_conn(new); + return status; } static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) @@ -2018,6 +2175,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfsd4_session *session; + struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; @@ -2038,19 +2196,26 @@ nfsd4_sequence(struct svc_rqst *rqstp, status = nfserr_badsession; session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp)); if (!session) - goto out; + goto out_no_session; + clp = session->se_client; + status = get_client_locked(clp); + if (status) + goto out_no_session; + status = nfsd4_get_session_locked(session); + if (status) + goto out_put_client; status = nfserr_too_many_ops; if (nfsd4_session_too_many_ops(rqstp, session)) - goto out; + goto out_put_session; status = nfserr_req_too_big; if (nfsd4_request_too_big(rqstp, session)) - goto out; + goto out_put_session; status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) - goto out; + goto out_put_session; slot = session->se_slots[seq->slotid]; dprintk("%s: slotid %d\n", __func__, seq->slotid); @@ -2065,7 +2230,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) - goto out; + goto out_put_session; cstate->slot = slot; cstate->session = session; /* Return the cached reply status and set cstate->status @@ -2075,10 +2240,12 @@ nfsd4_sequence(struct svc_rqst *rqstp, goto out; } if (status) - goto out; + goto out_put_session; - nfsd4_sequence_check_conn(conn, session); + status = nfsd4_sequence_check_conn(conn, session); conn = NULL; + if (status) + goto out_put_session; /* Success! bump slot seqid */ slot->sl_seqid = seq->seqid; @@ -2092,27 +2259,27 @@ nfsd4_sequence(struct svc_rqst *rqstp, cstate->session = session; out: - /* Hold a session reference until done processing the compound. */ - if (cstate->session) { - struct nfs4_client *clp = session->se_client; - - nfsd4_get_session(cstate->session); - atomic_inc(&clp->cl_refcount); - switch (clp->cl_cb_state) { - case NFSD4_CB_DOWN: - seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; - break; - case NFSD4_CB_FAULT: - seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; - break; - default: - seq->status_flags = 0; - } + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; +out_no_session: kfree(conn); spin_unlock(&nn->client_lock); - dprintk("%s: return %d\n", __func__, ntohl(status)); return status; +out_put_session: + nfsd4_put_session(session); +out_put_client: + put_client_renew_locked(clp); + goto out_no_session; } __be32 @@ -2125,17 +2292,12 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta nfs4_lock_state(); unconf = find_unconfirmed_client(&dc->clientid, true, nn); conf = find_confirmed_client(&dc->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); if (conf) { clp = conf; - if (!is_client_expired(conf) && client_has_state(conf)) { - status = nfserr_clientid_busy; - goto out; - } - - /* rfc5661 18.50.3 */ - if (cstate->session && conf == cstate->session->se_client) { + if (client_has_state(conf)) { status = nfserr_clientid_busy; goto out; } @@ -2145,11 +2307,13 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta status = nfserr_stale_clientid; goto out; } - + if (!mach_creds_match(clp, rqstp)) { + status = nfserr_wrong_cred; + goto out; + } expire_client(clp); out: nfs4_unlock_state(); - dprintk("%s return %d\n", __func__, ntohl(status)); return status; } @@ -2287,8 +2451,12 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, expire_client(unconf); } else { /* case 3: normal case; new or rebooted client */ conf = find_confirmed_client_by_name(&unconf->cl_name, nn); - if (conf) + if (conf) { + status = mark_client_expired(conf); + if (status) + goto out; expire_client(conf); + } move_to_confirmed(unconf); nfsd4_probe_callback(unconf); } @@ -2308,7 +2476,6 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) unsigned int hashval = file_hashval(ino); atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); fp->fi_inode = igrab(ino); @@ -2317,7 +2484,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); + hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); spin_unlock(&recall_lock); } @@ -2503,7 +2670,7 @@ find_file(struct inode *ino) struct nfs4_file *fp; spin_lock(&recall_lock); - list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); spin_unlock(&recall_lock); @@ -2526,8 +2693,6 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_ol_stateid *stp; __be32 ret; - dprintk("NFSD: nfs4_share_conflict\n"); - fp = find_file(ino); if (!fp) return nfs_ok; @@ -2546,6 +2711,9 @@ out: static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { + struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2553,15 +2721,15 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * it's safe to take a reference: */ atomic_inc(&dp->dl_count); - list_add_tail(&dp->dl_recall_lru, &del_recall_lru); + list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); - /* only place dl_time is set. protected by lock_flocks*/ + /* Only place dl_time is set; protected by i_lock: */ dp->dl_time = get_seconds(); nfsd4_cb_recall(dp); } -/* Called from break_lease() with lock_flocks() held. */ +/* Called from break_lease() with i_lock held. */ static void nfsd_break_deleg_cb(struct file_lock *fl) { struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; @@ -2699,7 +2867,7 @@ static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) } static __be32 -nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; @@ -2850,13 +3018,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f return fl; } -static int nfs4_setlease(struct nfs4_delegation *dp, int flag) +static int nfs4_setlease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_file; struct file_lock *fl; int status; - fl = nfs4_alloc_init_lease(dp, flag); + fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); @@ -2874,12 +3042,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) return 0; } -static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) +static int nfs4_set_delegation(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_file; if (!fp->fi_lease) - return nfs4_setlease(dp, flag); + return nfs4_setlease(dp); spin_lock(&recall_lock); if (fp->fi_had_conflict) { spin_unlock(&recall_lock); @@ -2915,6 +3083,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) /* * Attempt to hand out a delegation. + * + * Note we don't support write delegations, and won't until the vfs has + * proper support for them. */ static void nfs4_open_delegation(struct net *net, struct svc_fh *fh, @@ -2923,39 +3094,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, struct nfs4_delegation *dp; struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; - int status = 0, flag = 0; + int status = 0; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); - flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) open->op_recall = 1; - flag = open->op_delegate_type; - if (flag == NFS4_OPEN_DELEGATE_NONE) - goto out; + if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) + goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: - /* Let's not give out any delegations till everyone's - * had the chance to reclaim theirs.... */ + /* + * Let's not give out any delegations till everyone's + * had the chance to reclaim theirs.... + */ if (locks_in_grace(net)) - goto out; + goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) - goto out; + goto out_no_deleg; + /* + * Also, if the file was opened for write or + * create, there's a good chance the client's + * about to write to it, resulting in an + * immediate recall (since we don't support + * write delegations): + */ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flag = NFS4_OPEN_DELEGATE_WRITE; - else - flag = NFS4_OPEN_DELEGATE_READ; + goto out_no_deleg; + if (open->op_create == NFS4_OPEN_CREATE) + goto out_no_deleg; break; default: - goto out; + goto out_no_deleg; } - - dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); + dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh); if (dp == NULL) goto out_no_deleg; - status = nfs4_set_delegation(dp, flag); + status = nfs4_set_delegation(dp); if (status) goto out_free; @@ -2963,24 +3140,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", STATEID_VAL(&dp->dl_stid.sc_stateid)); -out: - open->op_delegate_type = flag; - if (flag == NFS4_OPEN_DELEGATE_NONE) { - if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && - open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) - dprintk("NFSD: WARNING: refusing delegation reclaim\n"); - - /* 4.1 client asking for a delegation? */ - if (open->op_deleg_want) - nfsd4_open_deleg_none_ext(open, status); - } + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; return; out_free: unhash_stid(&dp->dl_stid); nfs4_put_delegation(dp); out_no_deleg: - flag = NFS4_OPEN_DELEGATE_NONE; - goto out; + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && + open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); + open->op_recall = 1; + } + + /* 4.1 client asking for a delegation? */ + if (open->op_deleg_want) + nfsd4_open_deleg_none_ext(open, status); + return; } static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, @@ -3024,7 +3200,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(cl, fp, open, &dp); + status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; } else { @@ -3202,13 +3378,12 @@ nfs4_laundromat(struct nfsd_net *nn) clientid_val = t; break; } - if (atomic_read(&clp->cl_refcount)) { + if (mark_client_expired_locked(clp)) { dprintk("NFSD: client in use (clientid %08x)\n", clp->cl_clientid.cl_id); continue; } - unhash_client_locked(clp); - list_add(&clp->cl_lru, &reaplist); + list_move(&clp->cl_lru, &reaplist); } spin_unlock(&nn->client_lock); list_for_each_safe(pos, next, &reaplist) { @@ -3218,7 +3393,7 @@ nfs4_laundromat(struct nfsd_net *nn) expire_client(clp); } spin_lock(&recall_lock); - list_for_each_safe(pos, next, &del_recall_lru) { + list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) continue; @@ -3233,7 +3408,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation(dp); + revoke_delegation(dp); } test_val = nn->nfsd4_lease; list_for_each_safe(pos, next, &nn->close_lru) { @@ -3276,16 +3451,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s return nfs_ok; } -static int -STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn) -{ - if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time) - return 0; - dprintk("NFSD: stale stateid " STATEID_FMT "!\n", - STATEID_VAL(stateid)); - return 1; -} - static inline int access_permit_read(struct nfs4_ol_stateid *stp) { @@ -3348,7 +3513,7 @@ grace_disallows_io(struct net *net, struct inode *inode) /* Returns true iff a is later than b: */ static bool stateid_generation_after(stateid_t *a, stateid_t *b) { - return (s32)a->si_generation - (s32)b->si_generation > 0; + return (s32)(a->si_generation - b->si_generation) > 0; } static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) @@ -3402,13 +3567,24 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) return status; - if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + switch (s->sc_type) { + case NFS4_DELEG_STID: + return nfs_ok; + case NFS4_REVOKED_DELEG_STID: + return nfserr_deleg_revoked; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !(openowner(ols->st_stateowner)->oo_flags + & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; return nfs_ok; - ols = openlockstateid(s); - if (ols->st_stateowner->so_is_open_owner - && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + default: + printk("unknown stateid type %x\n", s->sc_type); + case NFS4_CLOSED_STID: return nfserr_bad_stateid; - return nfs_ok; + } } static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, @@ -3416,19 +3592,20 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfsd_net *nn) { struct nfs4_client *cl; + __be32 status; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; - if (STALE_STATEID(stateid, nn)) + status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, + nn, &cl); + if (status == nfserr_stale_clientid) return nfserr_stale_stateid; - cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn); - if (!cl) - return nfserr_expired; + if (status) + return status; *s = find_stateid_by_type(cl, stateid, typemask); if (!*s) return nfserr_bad_stateid; return nfs_ok; - } /* @@ -3538,6 +3715,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; + struct nfs4_delegation *dp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; @@ -3559,6 +3737,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, else ret = nfserr_locks_held; break; + case NFS4_REVOKED_DELEG_STID: + dp = delegstateid(s); + destroy_revoked_delegation(dp); + ret = nfs_ok; + break; default: ret = nfserr_bad_stateid; } @@ -3583,10 +3766,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + if (stp->st_stid.sc_type == NFS4_CLOSED_STID + || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) /* * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step. + * nfserr_replay_me from the previous step, and + * revoked delegations are kept only for free_stateid. */ return nfserr_bad_stateid; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); @@ -3616,7 +3801,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (status) return status; *stpp = openlockstateid(s); - cstate->replay_owner = (*stpp)->st_stateowner; + if (!nfsd4_has_session(cstate)) + cstate->replay_owner = (*stpp)->st_stateowner; return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } @@ -3674,6 +3860,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; out: + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); return status; @@ -3757,31 +3944,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); return status; } -void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) -{ - struct nfs4_openowner *oo; - struct nfs4_ol_stateid *s; - - if (!so->so_is_open_owner) - return; - oo = openowner(so); - s = oo->oo_last_closed_stid; - if (!s) - return; - if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { - /* Release the last_closed_stid on the next seqid bump: */ - oo->oo_flags |= NFS4_OO_PURGE_CLOSE; - return; - } - oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; - release_last_closed_stateid(oo); -} - static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { unhash_open_stateid(s); @@ -3810,28 +3978,30 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &close->cl_stateid, NFS4_OPEN_STID|NFS4_CLOSED_STID, &stp, nn); + nfsd4_bump_seqid(cstate, status); if (status) goto out; oo = openowner(stp->st_stateowner); - status = nfs_ok; update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); nfsd4_close_open_stateid(stp); - release_last_closed_stateid(oo); - oo->oo_last_closed_stid = stp; + + if (cstate->minorversion) { + unhash_stid(&stp->st_stid); + free_generic_stateid(stp); + } else + oo->oo_last_closed_stid = stp; if (list_empty(&oo->oo_owner.so_stateids)) { - if (cstate->minorversion) { + if (cstate->minorversion) release_openowner(oo); - cstate->replay_owner = NULL; - } else { + else { /* * In the 4.0 case we need to keep the owners around a * little while to handle CLOSE replay. */ - if (list_empty(&oo->oo_owner.so_stateids)) - move_to_close_lru(oo, SVC_NET(rqstp)); + move_to_close_lru(oo, SVC_NET(rqstp)); } } out: @@ -3863,7 +4033,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - unhash_delegation(dp); + destroy_delegation(dp); out: nfs4_unlock_state(); @@ -4241,6 +4411,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (status && new_state) release_lockowner(lock_sop); + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); if (file_lock) @@ -4395,21 +4566,16 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - /* - * Try to unlock the file in the VFS. - */ err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } - /* - * OK, unlock succeeded; the only thing left to do is update the stateid. - */ update_stateid(&stp->st_stid.sc_stateid); memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); if (file_lock) @@ -4433,7 +4599,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) struct inode *inode = filp->fi_inode; int status = 0; - lock_flocks(); + spin_lock(&inode->i_lock); for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { if ((*flpp)->fl_owner == (fl_owner_t)lowner) { status = 1; @@ -4441,7 +4607,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) } } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); return status; } @@ -4602,6 +4768,8 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) { + if (mark_client_expired(clp)) + return 0; expire_client(clp); return 1; } @@ -4708,7 +4876,7 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) spin_unlock(&recall_lock); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) - unhash_delegation(dp); + revoke_delegation(dp); return count; } @@ -4780,12 +4948,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_ void nfs4_state_init(void) { - int i; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - INIT_LIST_HEAD(&file_hashtbl[i]); - } - INIT_LIST_HEAD(&del_recall_lru); } /* @@ -4849,6 +5011,7 @@ static int nfs4_state_create_net(struct net *net) nn->unconf_name_tree = RB_ROOT; INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); + INIT_LIST_HEAD(&nn->del_recall_lru); spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); @@ -4961,16 +5124,14 @@ nfs4_state_shutdown_net(struct net *net) INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); - list_for_each_safe(pos, next, &del_recall_lru) { + list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (dp->dl_stid.sc_client->net != net) - continue; list_move(&dp->dl_recall_lru, &reaplist); } spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation(dp); + destroy_delegation(dp); } nfsd4_client_tracking_exit(net); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a2720071f282..0c0f3ea90de5 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -55,6 +55,11 @@ #include "cache.h" #include "netns.h" +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#include <linux/security.h> +#endif + + #define NFSDDBG_FACILITY NFSDDBG_XDR /* @@ -134,6 +139,19 @@ xdr_error: \ } \ } while (0) +static void next_decode_page(struct nfsd4_compoundargs *argp) +{ + argp->pagelist++; + argp->p = page_address(argp->pagelist[0]); + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + (argp->pagelen>>2); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } +} + static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) { /* We want more bytes than seem to be available. @@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) * guarantee p points to at least nbytes bytes. */ memcpy(p, argp->p, avail); - /* step to next page */ - argp->p = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); - argp->pagelen = 0; - } else { - argp->end = argp->p + (PAGE_SIZE>>2); - argp->pagelen -= PAGE_SIZE; - } + next_decode_page(argp); memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); argp->p += XDR_QUADLEN(nbytes - avail); return p; @@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) static __be32 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, - struct iattr *iattr, struct nfs4_acl **acl) + struct iattr *iattr, struct nfs4_acl **acl, + struct xdr_netobj *label) { int expected_len, len = 0; u32 dummy32; @@ -344,10 +354,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ32(dummy32); - if (dummy32) - return nfserr_inval; - READ32(iattr->ia_atime.tv_sec); + READ64(iattr->ia_atime.tv_sec); READ32(iattr->ia_atime.tv_nsec); if (iattr->ia_atime.tv_nsec >= (u32)1000000000) return nfserr_inval; @@ -370,10 +377,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ32(dummy32); - if (dummy32) - return nfserr_inval; - READ32(iattr->ia_mtime.tv_sec); + READ64(iattr->ia_mtime.tv_sec); READ32(iattr->ia_mtime.tv_nsec); if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) return nfserr_inval; @@ -386,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, goto xdr_error; } } + + label->len = 0; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + READ_BUF(4); + len += 4; + READ32(dummy32); /* lfs: we don't use it */ + READ_BUF(4); + len += 4; + READ32(dummy32); /* pi: we don't use it either */ + READ_BUF(4); + len += 4; + READ32(dummy32); + READ_BUF(dummy32); + if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) + return nfserr_badlabel; + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + label->data = kzalloc(dummy32 + 1, GFP_KERNEL); + if (!label->data) + return nfserr_jukebox; + defer_free(argp, kfree, label->data); + memcpy(label->data, buf, dummy32); + } +#endif + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) @@ -434,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ /* callback_sec_params4 */ READ_BUF(4); READ32(nr_secflavs); - cbs->flavor = (u32)(-1); + if (nr_secflavs) + cbs->flavor = (u32)(-1); + else + /* Is this legal? Be generous, take it to mean AUTH_NONE: */ + cbs->flavor = 0; for (i = 0; i < nr_secflavs; ++i) { READ_BUF(4); READ32(dummy); @@ -582,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create return status; status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl); + &create->cr_acl, &create->cr_label); if (status) goto out; @@ -804,6 +838,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_iattr.ia_valid = 0; open->op_openowner = NULL; + open->op_xdr_error = 0; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(4); READ32(open->op_seqid); @@ -832,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl); + &open->op_iattr, &open->op_acl, &open->op_label); if (status) goto out; break; @@ -846,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl); + &open->op_iattr, &open->op_acl, &open->op_label); if (status) goto out; break; @@ -1068,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta if (status) return status; return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl); + &setattr->sa_acl, &setattr->sa_label); } static __be32 @@ -1572,6 +1607,7 @@ struct nfsd4_minorversion_ops { static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, + [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, }; static __be32 @@ -1692,36 +1728,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) } while (0) #define ADJUST_ARGS() resp->p = p -/* - * Header routine to setup seqid operation replay cache - */ -#define ENCODE_SEQID_OP_HEAD \ - __be32 *save; \ - \ - save = resp->p; - -/* - * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This - * is where sequence id's are incremented, and the replay cache is filled. - * Note that we increment sequence id's here, at the last moment, so we're sure - * we know whether the error to be returned is a sequence id mutating error. - */ - -static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) -{ - struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; - - if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { - stateowner->so_seqid++; - stateowner->so_replay.rp_status = nfserr; - stateowner->so_replay.rp_buflen = - (char *)resp->p - (char *)save; - memcpy(stateowner->so_replay.rp_buf, save, - stateowner->so_replay.rp_buflen); - nfsd4_purge_closed_stateid(stateowner); - } -} - /* Encode as an array of strings the string given with components * separated @sep, escaped with esc_enter and esc_exit. */ @@ -1988,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, FATTR4_WORD0_RDATTR_ERROR) #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +static inline __be32 +nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +{ + __be32 *p = *pp; + + if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4)) + return nfserr_resource; + + /* + * For now we use a 0 here to indicate the null translation; in + * the future we may place a call to translation code here. + */ + if ((*buflen -= 8) < 0) + return nfserr_resource; + + WRITE32(0); /* lfs */ + WRITE32(0); /* pi */ + p = xdr_encode_opaque(p, context, len); + *buflen -= (XDR_QUADLEN(len) << 2) + 4; + + *pp = p; + return 0; +} +#else +static inline __be32 +nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +{ return 0; } +#endif + static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) { /* As per referral draft: */ @@ -2047,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, int err; int aclsupport = 0; struct nfs4_acl *acl = NULL; + void *context = NULL; + int contextlen; + bool contextsupport = false; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; struct path path = { @@ -2100,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, } } +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || + bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + err = security_inode_getsecctx(dentry->d_inode, + &context, &contextlen); + contextsupport = (err == 0); + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + if (err == -EOPNOTSUPP) + bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL; + else if (err) + goto out_nfserr; + } + } +#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ + if (bmval2) { if ((buflen -= 16) < 0) goto out_resource; @@ -2128,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; + if (!contextsupport) + word2 &= ~FATTR4_WORD2_SECURITY_LABEL; if (!word2) { if ((buflen -= 12) < 0) goto out_resource; @@ -2401,8 +2457,7 @@ out_acl: if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { if ((buflen -= 12) < 0) goto out_resource; - WRITE32(0); - WRITE32(stat.atime.tv_sec); + WRITE64((s64)stat.atime.tv_sec); WRITE32(stat.atime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { @@ -2415,15 +2470,13 @@ out_acl: if (bmval1 & FATTR4_WORD1_TIME_METADATA) { if ((buflen -= 12) < 0) goto out_resource; - WRITE32(0); - WRITE32(stat.ctime.tv_sec); + WRITE64((s64)stat.ctime.tv_sec); WRITE32(stat.ctime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { if ((buflen -= 12) < 0) goto out_resource; - WRITE32(0); - WRITE32(stat.mtime.tv_sec); + WRITE64((s64)stat.mtime.tv_sec); WRITE32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { @@ -2438,6 +2491,12 @@ out_acl: get_parent_attributes(exp, &stat); WRITE64(stat.ino); } + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + status = nfsd4_encode_security_label(rqstp, context, + contextlen, &p, &buflen); + if (status) + goto out; + } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { WRITE32(3); WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); @@ -2450,6 +2509,10 @@ out_acl: status = nfs_ok; out: +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if (context) + security_release_secctx(context, contextlen); +#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); if (fhp == &tempfh) fh_put(&tempfh); @@ -2661,12 +2724,9 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2762,14 +2822,11 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2784,12 +2841,9 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2812,7 +2866,6 @@ static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { __be32 *p; - ENCODE_SEQID_OP_HEAD; if (nfserr) goto out; @@ -2884,31 +2937,24 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -3138,13 +3184,13 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, - __be32 nfserr,struct svc_export *exp) + __be32 nfserr, struct svc_export *exp) { - int i = 0; - u32 nflavs; + u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - __be32 *p; + __be32 *p, *flavorsp; + static bool report = true; if (nfserr) goto out; @@ -3168,34 +3214,40 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, } } + supported = 0; RESERVE_SPACE(4); - WRITE32(nflavs); + flavorsp = p++; /* to be backfilled later */ ADJUST_ARGS(); + for (i = 0; i < nflavs; i++) { - u32 flav = flavs[i].pseudoflavor; - struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + rpc_authflavor_t pf = flavs[i].pseudoflavor; + struct rpcsec_gss_info info; - if (gm) { - RESERVE_SPACE(4); + if (rpcauth_get_gssinfo(pf, &info) == 0) { + supported++; + RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); WRITE32(RPC_AUTH_GSS); + WRITE32(info.oid.len); + WRITEMEM(info.oid.data, info.oid.len); + WRITE32(info.qop); + WRITE32(info.service); ADJUST_ARGS(); - RESERVE_SPACE(4 + gm->gm_oid.len); - WRITE32(gm->gm_oid.len); - WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); - ADJUST_ARGS(); + } else if (pf < RPC_AUTH_MAXFLAVOR) { + supported++; RESERVE_SPACE(4); - WRITE32(0); /* qop */ + WRITE32(pf); ADJUST_ARGS(); - RESERVE_SPACE(4); - WRITE32(gss_pseudoflavor_to_service(gm, flav)); - ADJUST_ARGS(); - gss_mech_put(gm); } else { - RESERVE_SPACE(4); - WRITE32(flav); - ADJUST_ARGS(); + if (report) + pr_warn("NFS: SECINFO: security flavor %u " + "is not supported\n", pf); } } + + if (nflavs != supported) + report = false; + *flavorsp = htonl(supported); + out: if (exp) exp_put(exp); @@ -3225,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 { __be32 *p; - RESERVE_SPACE(12); + RESERVE_SPACE(16); if (nfserr) { - WRITE32(2); + WRITE32(3); + WRITE32(0); WRITE32(0); WRITE32(0); } else { - WRITE32(2); + WRITE32(3); WRITE32(setattr->sa_bmval[0]); WRITE32(setattr->sa_bmval[1]); + WRITE32(setattr->sa_bmval[2]); } ADJUST_ARGS(); return nfserr; @@ -3275,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w return nfserr; } +static const u32 nfs4_minimal_spo_must_enforce[2] = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) +}; + static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) @@ -3313,6 +3375,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, /* state_protect4_r. Currently only support SP4_NONE */ BUG_ON(exid->spa_how != SP4_NONE); WRITE32(exid->spa_how); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce bitmap: */ + WRITE32(2); + WRITE32(nfs4_minimal_spo_must_enforce[0]); + WRITE32(nfs4_minimal_spo_must_enforce[1]); + /* empty spo_must_allow bitmap: */ + WRITE32(0); + break; + default: + WARN_ON_ONCE(1); + } /* The server_owner struct */ WRITE64(minor_id); /* Minor id */ @@ -3566,6 +3642,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { + struct nfs4_stateowner *so = resp->cstate.replay_owner; __be32 *statp; __be32 *p; @@ -3582,6 +3659,11 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) /* nfsd4_check_drc_limit guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); + if (so) { + so->so_replay.rp_status = op->status; + so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); + memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen); + } status: /* * Note: We write the status directly, instead of using WRITE32(), @@ -3678,13 +3760,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); if (nfsd4_has_session(cs)) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_client *clp = cs->session->se_client; if (cs->status != nfserr_replay_cache) { nfsd4_store_cache_entry(resp); cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; } /* Renew the clientid on success and on replay */ - release_session_client(cs->session); + spin_lock(&nn->client_lock); nfsd4_put_session(cs->session); + spin_unlock(&nn->client_lock); + put_client_renew(clp); } return 1; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ca05f6dc3544..e76244edd748 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -11,6 +11,8 @@ #include <linux/slab.h> #include <linux/sunrpc/addr.h> #include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/hash.h> #include <net/checksum.h> #include "nfsd.h" @@ -18,30 +20,49 @@ #define NFSDDBG_FACILITY NFSDDBG_REPCACHE -#define HASHSIZE 64 +/* + * We use this value to determine the number of hash buckets from the max + * cache size, the idea being that when the cache is at its maximum number + * of entries, then this should be the average number of entries per bucket. + */ +#define TARGET_BUCKET_SIZE 64 static struct hlist_head * cache_hash; static struct list_head lru_head; static struct kmem_cache *drc_slab; -static unsigned int num_drc_entries; + +/* max number of entries allowed in the cache */ static unsigned int max_drc_entries; +/* number of significant bits in the hash value */ +static unsigned int maskbits; + /* - * Calculate the hash index from an XID. + * Stats and other tracking of on the duplicate reply cache. All of these and + * the "rc" fields in nfsdstats are protected by the cache_lock */ -static inline u32 request_hash(u32 xid) -{ - u32 h = xid; - h ^= (xid >> 24); - return h & (HASHSIZE-1); -} + +/* total number of entries */ +static unsigned int num_drc_entries; + +/* cache misses due only to checksum comparison failures */ +static unsigned int payload_misses; + +/* amount of memory (in bytes) currently consumed by the DRC */ +static unsigned int drc_mem_usage; + +/* longest hash chain seen */ +static unsigned int longest_chain; + +/* size of cache when we saw the longest hash chain */ +static unsigned int longest_chain_cachesize; static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); static void cache_cleaner_func(struct work_struct *unused); static int nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc); -struct shrinker nfsd_reply_cache_shrinker = { +static struct shrinker nfsd_reply_cache_shrinker = { .shrink = nfsd_reply_cache_shrink, .seeks = 1, }; @@ -82,6 +103,16 @@ nfsd_cache_size_limit(void) return min_t(unsigned int, limit, 256*1024); } +/* + * Compute the number of hash buckets we need. Divide the max cachesize by + * the "target" max bucket size, and round up to next power of two. + */ +static unsigned int +nfsd_hashsize(unsigned int limit) +{ + return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); +} + static struct svc_cacherep * nfsd_reply_cache_alloc(void) { @@ -100,12 +131,15 @@ nfsd_reply_cache_alloc(void) static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF) + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { + drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); + } if (!hlist_unhashed(&rp->c_hash)) hlist_del(&rp->c_hash); list_del(&rp->c_lru); --num_drc_entries; + drc_mem_usage -= sizeof(*rp); kmem_cache_free(drc_slab, rp); } @@ -119,9 +153,13 @@ nfsd_reply_cache_free(struct svc_cacherep *rp) int nfsd_reply_cache_init(void) { + unsigned int hashsize; + INIT_LIST_HEAD(&lru_head); max_drc_entries = nfsd_cache_size_limit(); num_drc_entries = 0; + hashsize = nfsd_hashsize(max_drc_entries); + maskbits = ilog2(hashsize); register_shrinker(&nfsd_reply_cache_shrinker); drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), @@ -129,7 +167,7 @@ int nfsd_reply_cache_init(void) if (!drc_slab) goto out_nomem; - cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL); if (!cache_hash) goto out_nomem; @@ -180,7 +218,7 @@ static void hash_refile(struct svc_cacherep *rp) { hlist_del_init(&rp->c_hash); - hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); + hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); } static inline bool @@ -273,6 +311,26 @@ nfsd_cache_csum(struct svc_rqst *rqstp) return csum; } +static bool +nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) +{ + /* Check RPC header info first */ + if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc || + rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers || + rqstp->rq_arg.len != rp->c_len || + !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || + rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + return false; + + /* compare checksum of NFS data */ + if (csum != rp->c_csum) { + ++payload_misses; + return false; + } + + return true; +} + /* * Search the request hash for an entry that matches the given rqstp. * Must be called with cache_lock held. Returns the found entry or @@ -281,23 +339,30 @@ nfsd_cache_csum(struct svc_rqst *rqstp) static struct svc_cacherep * nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) { - struct svc_cacherep *rp; + struct svc_cacherep *rp, *ret = NULL; struct hlist_head *rh; - __be32 xid = rqstp->rq_xid; - u32 proto = rqstp->rq_prot, - vers = rqstp->rq_vers, - proc = rqstp->rq_proc; + unsigned int entries = 0; - rh = &cache_hash[request_hash(xid)]; + rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)]; hlist_for_each_entry(rp, rh, c_hash) { - if (xid == rp->c_xid && proc == rp->c_proc && - proto == rp->c_prot && vers == rp->c_vers && - rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum && - rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) && - rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr)) - return rp; + ++entries; + if (nfsd_cache_match(rqstp, csum, rp)) { + ret = rp; + break; + } } - return NULL; + + /* tally hash chain length stats */ + if (entries > longest_chain) { + longest_chain = entries; + longest_chain_cachesize = num_drc_entries; + } else if (entries == longest_chain) { + /* prefer to keep the smallest cachesize possible here */ + longest_chain_cachesize = min(longest_chain_cachesize, + num_drc_entries); + } + + return ret; } /* @@ -318,55 +383,55 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) __wsum csum; unsigned long age; int type = rqstp->rq_cachetype; - int rtn; + int rtn = RC_DOIT; rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { nfsdstats.rcnocache++; - return RC_DOIT; + return rtn; } csum = nfsd_cache_csum(rqstp); + /* + * Since the common case is a cache miss followed by an insert, + * preallocate an entry. First, try to reuse the first entry on the LRU + * if it works, then go ahead and prune the LRU list. + */ spin_lock(&cache_lock); - rtn = RC_DOIT; - - rp = nfsd_cache_search(rqstp, csum); - if (rp) - goto found_entry; - - /* Try to use the first entry on the LRU */ if (!list_empty(&lru_head)) { rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); if (nfsd_cache_entry_expired(rp) || num_drc_entries >= max_drc_entries) { lru_put_end(rp); prune_cache_entries(); - goto setup_entry; + goto search_cache; } } - /* Drop the lock and allocate a new entry */ + /* No expired ones available, allocate a new one. */ spin_unlock(&cache_lock); rp = nfsd_reply_cache_alloc(); - if (!rp) { - dprintk("nfsd: unable to allocate DRC entry!\n"); - return RC_DOIT; - } spin_lock(&cache_lock); - ++num_drc_entries; + if (likely(rp)) { + ++num_drc_entries; + drc_mem_usage += sizeof(*rp); + } - /* - * Must search again just in case someone inserted one - * after we dropped the lock above. - */ +search_cache: found = nfsd_cache_search(rqstp, csum); if (found) { - nfsd_reply_cache_free_locked(rp); + if (likely(rp)) + nfsd_reply_cache_free_locked(rp); rp = found; goto found_entry; } + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + goto out; + } + /* * We're keeping the one we just allocated. Are we now over the * limit? Prune one off the tip of the LRU in trade for the one we @@ -376,7 +441,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) nfsd_reply_cache_free_locked(list_first_entry(&lru_head, struct svc_cacherep, c_lru)); -setup_entry: nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; @@ -394,6 +458,7 @@ setup_entry: /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { + drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); rp->c_replvec.iov_base = NULL; } @@ -462,6 +527,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; int len; + size_t bufsize = 0; if (!rp) return; @@ -483,19 +549,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) break; case RC_REPLBUFF: cachv = &rp->c_replvec; - cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); + bufsize = len << 2; + cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); if (!cachv->iov_base) { nfsd_reply_cache_free(rp); return; } - cachv->iov_len = len << 2; - memcpy(cachv->iov_base, statp, len << 2); + cachv->iov_len = bufsize; + memcpy(cachv->iov_base, statp, bufsize); break; case RC_NOCACHE: nfsd_reply_cache_free(rp); return; } spin_lock(&cache_lock); + drc_mem_usage += bufsize; lru_put_end(rp); rp->c_secure = rqstp->rq_secure; rp->c_type = cachetype; @@ -523,3 +591,30 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) vec->iov_len += data->iov_len; return 1; } + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +{ + spin_lock(&cache_lock); + seq_printf(m, "max entries: %u\n", max_drc_entries); + seq_printf(m, "num entries: %u\n", num_drc_entries); + seq_printf(m, "hash buckets: %u\n", 1 << maskbits); + seq_printf(m, "mem usage: %u\n", drc_mem_usage); + seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); + seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); + seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); + seq_printf(m, "payload misses: %u\n", payload_misses); + seq_printf(m, "longest chain len: %u\n", longest_chain); + seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); + spin_unlock(&cache_lock); + return 0; +} + +int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_reply_cache_stats_show, NULL); +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f33455b4d957..7f555179bf81 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -35,6 +35,7 @@ enum { NFSD_Threads, NFSD_Pool_Threads, NFSD_Pool_Stats, + NFSD_Reply_Cache_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -177,7 +178,7 @@ static int export_features_open(struct inode *inode, struct file *file) return single_open(file, export_features_show, NULL); } -static struct file_operations export_features_operations = { +static const struct file_operations export_features_operations = { .open = export_features_open, .read = seq_read, .llseek = seq_lseek, @@ -196,7 +197,7 @@ static int supported_enctypes_open(struct inode *inode, struct file *file) return single_open(file, supported_enctypes_show, NULL); } -static struct file_operations supported_enctypes_ops = { +static const struct file_operations supported_enctypes_ops = { .open = supported_enctypes_open, .read = seq_read, .llseek = seq_lseek, @@ -212,6 +213,13 @@ static const struct file_operations pool_stats_operations = { .owner = THIS_MODULE, }; +static struct file_operations reply_cache_stats_operations = { + .open = nfsd_reply_cache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -1047,6 +1055,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, @@ -1102,8 +1111,10 @@ static int create_proc_exports_entry(void) return -ENOMEM; entry = proc_create("exports", 0, entry, &exports_proc_operations); - if (!entry) + if (!entry) { + remove_proc_entry("fs/nfs", NULL); return -ENOMEM; + } return 0; } #else /* CONFIG_PROC_FS */ diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 07a473fd49bc..2bbd94e51efc 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,7 +24,7 @@ /* * nfsd version */ -#define NFSD_SUPPORTED_MINOR_VERSION 1 +#define NFSD_SUPPORTED_MINOR_VERSION 2 /* * Maximum blocksizes supported by daemon under various circumstances. */ @@ -243,6 +243,12 @@ void nfsd_lockd_shutdown(void); #define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) #define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) #define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) +#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) +#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) +#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP) +#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) +#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) +#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. @@ -322,6 +328,13 @@ void nfsd_lockd_shutdown(void); #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL) +#else +#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0 +#endif + static inline u32 nfsd_suppattrs0(u32 minorversion) { return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 @@ -336,8 +349,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion) static inline u32 nfsd_suppattrs2(u32 minorversion) { - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 - : NFSD4_SUPPORTED_ATTRS_WORD2; + switch (minorversion) { + default: return NFSD4_2_SUPPORTED_ATTRS_WORD2; + case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2; + case 0: return NFSD4_SUPPORTED_ATTRS_WORD2; + } } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ @@ -350,7 +366,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL +#else #define NFSD_WRITEABLE_ATTRS_WORD2 0 +#endif #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 262df5ccbf59..6b9f48ca4c25 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -116,7 +116,7 @@ struct svc_program nfsd_program = { }; -u32 nfsd_supported_minorversion; +u32 nfsd_supported_minorversion = 1; int nfsd_vers(int vers, enum vers_op change) { diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1a8c7391f7ae..424d8f5f2317 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -79,6 +79,8 @@ struct nfs4_stid { #define NFS4_DELEG_STID 4 /* For an open stateid kept around *only* to process close replays: */ #define NFS4_CLOSED_STID 8 +/* For a deleg stateid kept around only to process free_stateid's: */ +#define NFS4_REVOKED_DELEG_STID 16 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; @@ -194,9 +196,11 @@ struct nfsd4_conn { }; struct nfsd4_session { - struct kref se_ref; + atomic_t se_ref; struct list_head se_hash; /* hash by sessionid */ struct list_head se_perclnt; +/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */ +#define NFS4_SESSION_DEAD 0x010 u32 se_flags; struct nfs4_client *se_client; struct nfs4_sessionid se_sessionid; @@ -236,11 +240,13 @@ struct nfs4_client { struct list_head cl_openowners; struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; + struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ struct sockaddr_storage cl_addr; /* client ipaddress */ + bool cl_mach_cred; /* SP4_MACH_CRED in force */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ @@ -286,18 +292,6 @@ struct nfs4_client { struct net *net; }; -static inline void -mark_client_expired(struct nfs4_client *clp) -{ - clp->cl_time = 0; -} - -static inline bool -is_client_expired(struct nfs4_client *clp) -{ - return clp->cl_time == 0; -} - /* struct nfs4_client_reset * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl * upon lease reset, or from upcall to state_daemon (to read in state @@ -365,7 +359,6 @@ struct nfs4_openowner { struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 -#define NFS4_OO_PURGE_CLOSE 2 #define NFS4_OO_NEW 4 unsigned char oo_flags; }; @@ -373,7 +366,7 @@ struct nfs4_openowner { struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ struct list_head lo_owner_ino_hash; /* hash by owner,file */ - struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_perstateid; struct list_head lo_list; /* for temporary uses */ }; @@ -390,7 +383,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) /* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */ struct nfs4_file { atomic_t fi_ref; - struct list_head fi_hash; /* hash by "struct inode *" */ + struct hlist_node fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ @@ -486,8 +479,7 @@ extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); -extern void release_session_client(struct nfsd4_session *); -extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); +extern void put_client_renew(struct nfs4_client *clp); /* nfs4recover operations */ extern int nfsd4_client_tracking_init(struct net *net); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2b2e2396a869..8ff6a0019b0b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -28,6 +28,7 @@ #include <asm/uaccess.h> #include <linux/exportfs.h> #include <linux/writeback.h> +#include <linux/security.h> #ifdef CONFIG_NFSD_V3 #include "xdr3.h" @@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; return 1; } +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct xdr_netobj *label) +{ + __be32 error; + int host_error; + struct dentry *dentry; + + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + + mutex_lock(&dentry->d_inode->i_mutex); + host_error = security_inode_setsecctx(dentry, label->data, label->len); + mutex_unlock(&dentry->d_inode->i_mutex); + return nfserrno(host_error); +} +#else +__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct xdr_netobj *label) +{ + return nfserr_notsupp; +} +#endif + #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 @@ -1758,10 +1786,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, tdentry = tfhp->fh_dentry; tdir = tdentry->d_inode; - err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; - if (ffhp->fh_export != tfhp->fh_export) - goto out; - err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; @@ -1802,6 +1826,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out_dput_new; host_err = nfsd_break_lease(odentry->d_inode); if (host_err) @@ -1914,6 +1940,7 @@ struct buffered_dirent { }; struct readdir_data { + struct dir_context ctx; char *dirent; size_t used; int full; @@ -1945,13 +1972,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, struct readdir_cd *cdp, loff_t *offsetp) { - struct readdir_data buf; struct buffered_dirent *de; int host_err; int size; loff_t offset; + struct readdir_data buf = { + .ctx.actor = nfsd_buffered_filldir, + .dirent = (void *)__get_free_page(GFP_KERNEL) + }; - buf.dirent = (void *)__get_free_page(GFP_KERNEL); if (!buf.dirent) return nfserrno(-ENOMEM); @@ -1965,7 +1994,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, buf.used = 0; buf.full = 0; - host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); + host_err = iterate_dir(file, &buf.ctx); if (buf.full) host_err = 0; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 5b5894159f22..a4be2e389670 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -39,7 +39,6 @@ typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); /* nfsd/vfs.c */ -int fh_lock_parent(struct svc_fh *, struct dentry *); int nfsd_racache_init(int); void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, @@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *); __be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, struct nfs4_acl *); int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); +__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, + struct xdr_netobj *); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, @@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *, struct svc_fh *, char *, int); __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); -int nfsd_truncate(struct svc_rqst *, struct svc_fh *, - unsigned long size); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, loff_t *, struct readdir_cd *, filldir_t); __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, struct kstatfs *, int access); -int nfsd_notify_change(struct inode *, struct iattr *); __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -int nfsd_sync_dir(struct dentry *dp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 546f8983ecf1..b3ed6446ed8e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -40,6 +40,7 @@ #include "state.h" #include "nfsd.h" +#define NFSD4_MAX_SEC_LABEL_LEN 2048 #define NFSD4_MAX_TAGLEN 128 #define XDR_LEN(n) (((n) + 3) & ~3) @@ -118,6 +119,7 @@ struct nfsd4_create { struct iattr cr_iattr; /* request */ struct nfsd4_change_info cr_cinfo; /* response */ struct nfs4_acl *cr_acl; + struct xdr_netobj cr_label; }; #define cr_linklen u.link.namelen #define cr_linkname u.link.name @@ -184,7 +186,6 @@ struct nfsd4_lock { #define lk_old_lock_stateid v.old.lock_stateid #define lk_old_lock_seqid v.old.lock_seqid -#define lk_rflags u.ok.rflags #define lk_resp_stateid u.ok.stateid #define lk_denied u.denied @@ -237,6 +238,7 @@ struct nfsd4_open { u32 op_share_deny; /* request */ u32 op_deleg_want; /* request */ stateid_t op_stateid; /* response */ + __be32 op_xdr_error; /* see nfsd4_open_omfg() */ u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ @@ -246,6 +248,7 @@ struct nfsd4_open { struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_acl *op_acl; + struct xdr_netobj op_label; }; #define op_iattr iattr @@ -330,6 +333,7 @@ struct nfsd4_setattr { u32 sa_bmval[3]; /* request */ struct iattr sa_iattr; /* request */ struct nfs4_acl *sa_acl; + struct xdr_netobj sa_label; }; struct nfsd4_setclientid { @@ -623,6 +627,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid); extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); +extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); #endif /* diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h new file mode 100644 index 000000000000..c5c55dfb91a9 --- /dev/null +++ b/fs/nfsd/xdr4cb.h @@ -0,0 +1,23 @@ +#define NFS4_MAXTAGLEN 20 + +#define NFS4_enc_cb_null_sz 0 +#define NFS4_dec_cb_null_sz 0 +#define cb_compound_enc_hdr_sz 4 +#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + +#define op_enc_sz 1 +#define op_dec_sz 2 +#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) +#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_stateid_sz + \ + enc_nfs4_fh_sz) + +#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index eed4d7b26249..741fd02e0444 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, } /** + * nilfs_palloc_count_desc_blocks - count descriptor blocks number + * @inode: inode of metadata file using this allocator + * @desc_blocks: descriptor blocks number [out] + */ +static int nilfs_palloc_count_desc_blocks(struct inode *inode, + unsigned long *desc_blocks) +{ + unsigned long blknum; + int ret; + + ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); + if (likely(!ret)) + *desc_blocks = DIV_ROUND_UP( + blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); + return ret; +} + +/** + * nilfs_palloc_mdt_file_can_grow - check potential opportunity for + * MDT file growing + * @inode: inode of metadata file using this allocator + * @desc_blocks: known current descriptor blocks count + */ +static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, + unsigned long desc_blocks) +{ + return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < + nilfs_palloc_groups_count(inode); +} + +/** + * nilfs_palloc_count_max_entries - count max number of entries that can be + * described by descriptor blocks count + * @inode: inode of metadata file using this allocator + * @nused: current number of used entries + * @nmaxp: max number of entries [out] + */ +int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) +{ + unsigned long desc_blocks = 0; + u64 entries_per_desc_block, nmax; + int err; + + err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); + if (unlikely(err)) + return err; + + entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * + nilfs_palloc_groups_per_desc_block(inode); + nmax = entries_per_desc_block * desc_blocks; + + if (nused == nmax && + nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) + nmax += entries_per_desc_block; + + if (nused > nmax) + return -ERANGE; + + *nmaxp = nmax; + return 0; +} + +/** * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index fb7238100548..4bd6451b5703 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int, void *nilfs_palloc_block_get_entry(const struct inode *, __u64, const struct buffer_head *, void *); +int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); + /** * nilfs_palloc_req - persistent allocator request and reply * @pr_entry_nr: entry number (vblocknr or inode number) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index f30b017740a7..197a63e9d102 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } -static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int nilfs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); /* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ - unsigned char *types = NULL; - int ret; if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) - goto success; - - types = nilfs_filetype_table; + return 0; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { nilfs_error(sb, __func__, "bad page in #%lu", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; - ret = -EIO; - goto done; + ctx->pos += PAGE_CACHE_SIZE - offset; + return -EIO; } kaddr = page_address(page); de = (struct nilfs_dir_entry *)(kaddr + offset); @@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (de->rec_len == 0) { nilfs_error(sb, __func__, "zero-length directory entry"); - ret = -EIO; nilfs_put_page(page); - goto done; + return -EIO; } if (de->inode) { - int over; - unsigned char d_type = DT_UNKNOWN; + unsigned char t; - if (types && de->file_type < NILFS_FT_MAX) - d_type = types[de->file_type]; + if (de->file_type < NILFS_FT_MAX) + t = nilfs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, - le64_to_cpu(de->inode), d_type); - if (over) { + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode), t)) { nilfs_put_page(page); - goto success; + return 0; } } - filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); + ctx->pos += nilfs_rec_len_from_disk(de->rec_len); } nilfs_put_page(page); } - -success: - ret = 0; -done: - return ret; + return 0; } /* @@ -678,7 +666,7 @@ not_empty: const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = nilfs_readdir, + .iterate = nilfs_readdir, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index d8e65bde083c..6548c7851b48 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, } /** + * nilfs_ifile_count_free_inodes - calculate free inodes count + * @ifile: ifile inode + * @nmaxinodes: current maximum of available inodes count [out] + * @nfreeinodes: free inodes count [out] + */ +int nilfs_ifile_count_free_inodes(struct inode *ifile, + u64 *nmaxinodes, u64 *nfreeinodes) +{ + u64 nused; + int err; + + *nmaxinodes = 0; + *nfreeinodes = 0; + + nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count); + err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); + if (likely(!err)) + *nfreeinodes = *nmaxinodes - nused; + return err; +} + +/** * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 59b6f2b51df6..679674d13372 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); int nilfs_ifile_delete_inode(struct inode *, ino_t); int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); +int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); + int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, size_t inode_size, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6b49f14eac8c..b1a5277cfd18 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -25,7 +25,7 @@ #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/uio.h> +#include <linux/aio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" @@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n) inode_add_bytes(inode, (1 << inode->i_blkbits) * n); if (root) - atomic_add(n, &root->blocks_count); + atomic64_add(n, &root->blocks_count); } void nilfs_inode_sub_blocks(struct inode *inode, int n) @@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); if (root) - atomic_sub(n, &root->blocks_count); + atomic64_sub(n, &root->blocks_count); } /** @@ -175,6 +175,11 @@ static int nilfs_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int err = 0; + if (inode->i_sb->s_flags & MS_RDONLY) { + nilfs_clear_dirty_pages(mapping, false); + return -EROFS; + } + if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_dsync_segment(inode->i_sb, inode, wbc->range_start, @@ -187,6 +192,18 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; int err; + if (inode->i_sb->s_flags & MS_RDONLY) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. + */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + redirty_page_for_writepage(wbc, page); unlock_page(page); @@ -202,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) static int nilfs_set_page_dirty(struct page *page) { - int ret = __set_page_dirty_buffers(page); + int ret = __set_page_dirty_nobuffers(page); - if (ret) { + if (page_has_buffers(page)) { struct inode *inode = page->mapping->host; - unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + unsigned nr_dirty = 0; + struct buffer_head *bh, *head; - nilfs_set_file_dirty(inode, nr_dirty); + /* + * This page is locked by callers, and no other thread + * concurrently marks its buffers dirty since they are + * only dirtied through routines in fs/buffer.c in + * which call sites of mark_buffer_dirty are protected + * by page lock. + */ + bh = head = page_buffers(page); + do { + /* Do not mark hole blocks dirty */ + if (buffer_dirty(bh) || !buffer_mapped(bh)) + continue; + + set_buffer_dirty(bh); + nr_dirty++; + } while (bh = bh->b_this_page, bh != head); + + if (nr_dirty) + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } @@ -333,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ - atomic_inc(&root->inodes_count); + atomic64_inc(&root->inodes_count); inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -765,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode) ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); if (!ret) - atomic_dec(&ii->i_root->inodes_count); + atomic64_dec(&ii->i_root->inodes_count); nilfs_clear_inode(inode); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f9897d09c693..c4dcd1db57ee 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -375,14 +375,25 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) static int nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode; + struct inode *inode = page->mapping->host; struct super_block *sb; int err = 0; + if (inode && (inode->i_sb->s_flags & MS_RDONLY)) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. + */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + redirty_page_for_writepage(wbc, page); unlock_page(page); - inode = page->mapping->host; if (!inode) return 0; @@ -561,10 +572,10 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) if (mi->mi_palloc_cache) nilfs_palloc_clear_cache(inode); - nilfs_clear_dirty_pages(inode->i_mapping); + nilfs_clear_dirty_pages(inode->i_mapping, true); nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); - nilfs_clear_dirty_pages(&ii->i_btnode_cache); + nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 07f76db04ec7..0ba679866e50 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,12 @@ repeat: goto repeat; } -void nilfs_clear_dirty_pages(struct address_space *mapping) +/** + * nilfs_clear_dirty_pages - discard dirty pages in address space + * @mapping: address space with dirty pages for discarding + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { struct pagevec pvec; unsigned int i; @@ -382,25 +387,9 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; lock_page(page); - ClearPageUptodate(page); - ClearPageMappedToDisk(page); - bh = head = page_buffers(page); - do { - lock_buffer(bh); - clear_buffer_dirty(bh); - clear_buffer_nilfs_volatile(bh); - clear_buffer_nilfs_checked(bh); - clear_buffer_nilfs_redirected(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - unlock_buffer(bh); - bh = bh->b_this_page; - } while (bh != head); - - __nilfs_clear_page_dirty(page); + nilfs_clear_dirty_page(page, silent); unlock_page(page); } pagevec_release(&pvec); @@ -408,6 +397,51 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) } } +/** + * nilfs_clear_dirty_page - discard dirty page + * @page: dirty page that will be discarded + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_page(struct page *page, bool silent) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + + BUG_ON(!PageLocked(page)); + + if (!silent) { + nilfs_warning(sb, __func__, + "discard page: offset %lld, ino %lu", + page_offset(page), inode->i_ino); + } + + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + lock_buffer(bh); + if (!silent) { + nilfs_warning(sb, __func__, + "discard block %llu, size %zu", + (u64)bh->b_blocknr, bh->b_size); + } + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + } while (bh = bh->b_this_page, bh != head); + } + + __nilfs_clear_page_dirty(page); +} + unsigned nilfs_page_count_clean_buffers(struct page *page, unsigned from, unsigned to) { diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index fb7de71605a0..ef30c5c2426f 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -55,7 +55,8 @@ void nilfs_page_bug(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); -void nilfs_clear_dirty_pages(struct address_space *); +void nilfs_clear_dirty_page(struct page *, bool); +void nilfs_clear_dirty_pages(struct address_space *, bool); void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a5752a589932..bd88a7461063 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) raw_cp->cp_snapshot_list.ssl_next = 0; raw_cp->cp_snapshot_list.ssl_prev = 0; raw_cp->cp_inodes_count = - cpu_to_le64(atomic_read(&sci->sc_root->inodes_count)); + cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); raw_cp->cp_blocks_count = - cpu_to_le64(atomic_read(&sci->sc_root->blocks_count)); + cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); raw_cp->cp_nblk_inc = cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7d1f9f18b09..af3ba0478cdf 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, if (err) goto failed_bh; - atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); - atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + atomic64_set(&root->inodes_count, + le64_to_cpu(raw_cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, + le64_to_cpu(raw_cp->cp_blocks_count)); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); @@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) unsigned long overhead; unsigned long nrsvblocks; sector_t nfreeblocks; + u64 nmaxinodes, nfreeinodes; int err; /* @@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (unlikely(err)) return err; + err = nilfs_ifile_count_free_inodes(root->ifile, + &nmaxinodes, &nfreeinodes); + if (unlikely(err)) { + printk(KERN_WARNING + "NILFS warning: fail to count free inodes: err %d.\n", + err); + if (err == -ERANGE) { + /* + * If nilfs_palloc_count_max_entries() returns + * -ERANGE error code then we simply treat + * curent inodes count as maximum possible and + * zero as free inodes value. + */ + nmaxinodes = atomic64_read(&root->inodes_count); + nfreeinodes = 0; + err = 0; + } else + return err; + } + buf->f_type = NILFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = blocks - overhead; buf->f_bfree = nfreeblocks; buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? (buf->f_bfree - nrsvblocks) : 0; - buf->f_files = atomic_read(&root->inodes_count); - buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_files = nmaxinodes; + buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, static int nilfs_tree_was_touched(struct dentry *root_dentry) { - return root_dentry->d_count > 1; + return d_count(root_dentry) > 1; } /** diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 41e6a04a561f..94c451ce6d24 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->ifile = NULL; new->nilfs = nilfs; atomic_set(&new->count, 1); - atomic_set(&new->inodes_count, 0); - atomic_set(&new->blocks_count, 0); + atomic64_set(&new->inodes_count, 0); + atomic64_set(&new->blocks_count, 0); rb_link_node(&new->rb_node, parent, p); rb_insert_color(&new->rb_node, &nilfs->ns_cptree); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index be1267a34cea..de8cc53b4a5c 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -241,8 +241,8 @@ struct nilfs_root { struct the_nilfs *nilfs; struct inode *ifile; - atomic_t inodes_count; - atomic_t blocks_count; + atomic64_t inodes_count; + atomic64_t blocks_count; }; /* Special checkpoint number */ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 2bfe6dc413a0..1fedd5f7ccc4 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1; static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; static struct fsnotify_group *dnotify_group __read_mostly; -static DEFINE_MUTEX(dnotify_mark_mutex); /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which @@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; @@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id) spin_unlock(&fsn_mark->lock); - /* nothing else could have found us thanks to the dnotify_mark_mutex */ + /* nothing else could have found us thanks to the dnotify_groups + mark_mutex */ if (dn_mark->dn == NULL) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); } @@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. */ fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); @@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, + NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; @@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed - * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the - * only time we clean up the marks we need to get our mark off - * the list. */ + * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the + * fd is the only time we clean up the marks we need to get our mark + * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too @@ -385,9 +386,9 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 5d8444268a16..e44cb6427df3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include <asm/ioctls.h> @@ -121,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->reserved = 0; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); if (unlikely(event->mask & FAN_Q_OVERFLOW)) @@ -398,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file) wake_up(&group->fanotify_data.access_waitq); #endif - if (file->f_flags & FASYNC) - fsnotify_fasync(-1, file, 0); - /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); @@ -525,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -549,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); + /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); if (removed & inode->i_fsnotify_mask) @@ -592,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return mask & ~oldmask; } +static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + int ret; + + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return ERR_PTR(-ENOSPC); + + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!mark) + return ERR_PTR(-ENOMEM); + + fsnotify_init_mark(mark, fanotify_free_mark); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + if (ret) { + fsnotify_put_mark(mark); + return ERR_PTR(ret); + } + + return mark; +} + + static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, unsigned int flags) { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) - return -ENOSPC; - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) - return -ENOMEM; - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) - goto err; + fsn_mark = fanotify_add_new_mark(group, NULL, mnt); + if (IS_ERR(fsn_mark)) { + mutex_unlock(&group->mark_mutex); + return PTR_ERR(fsn_mark); + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -629,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -643,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, (atomic_read(&inode->i_writecount) > 0)) return 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) - return -ENOSPC; - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) - return -ENOMEM; - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) - goto err; + fsn_mark = fanotify_add_new_mark(group, inode, NULL); + if (IS_ERR(fsn_mark)) { + mutex_unlock(&group->mark_mutex); + return PTR_ERR(fsn_mark); + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } /* fanotify syscalls */ @@ -755,9 +778,9 @@ out_destroy_group: return fd; } -SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, - __u64 mask, int dfd, - const char __user * pathname) +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; @@ -857,15 +880,20 @@ fput_and_out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask, - long dfd, long pathname) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(fanotify_mark, + int, fanotify_fd, unsigned int, flags, + __u32, mask0, __u32, mask1, int, dfd, + const char __user *, pathname) { - return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags, - mask, (int) dfd, - (const char __user *) pathname); + return sys_fanotify_mark(fanotify_fd, flags, +#ifdef __BIG_ENDIAN + ((__u64)mask1 << 32) | mask0, +#else + ((__u64)mask0 << 32) | mask1, +#endif + dfd, pathname); } -SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); #endif /* diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e0f7c1241a6a..60f954a891ab 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -287,9 +287,6 @@ static int inotify_release(struct inode *ignored, struct file *file) pr_debug("%s: group=%p\n", __func__, group); - if (file->f_flags & FASYNC) - fsnotify_fasync(-1, file, 0); - /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); @@ -359,7 +356,6 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns } static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, - int *last_wd, struct inotify_inode_mark *i_mark) { int ret; @@ -367,11 +363,10 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, idr_preload(GFP_KERNEL); spin_lock(idr_lock); - ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT); + ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); if (ret >= 0) { /* we added the mark to the idr, take a reference */ i_mark->wd = ret; - *last_wd = i_mark->wd; fsnotify_get_mark(&i_mark->fsn_mark); } @@ -572,7 +567,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int add = (arg & IN_MASK_ADD); int ret; - /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); fsn_mark = fsnotify_find_inode_mark(group, inode); @@ -623,7 +617,6 @@ static int inotify_new_watch(struct fsnotify_group *group, struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; - /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); @@ -638,13 +631,13 @@ static int inotify_new_watch(struct fsnotify_group *group, if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd, - tmp_i_mark); + ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, + NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); @@ -668,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { int ret = 0; -retry: + mutex_lock(&group->mark_mutex); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); - /* - * inotify_new_watch could race with another thread which did an - * inotify_new_watch between the update_existing and the add watch - * here, go back and try to update an existing mark again. - */ - if (ret == -EEXIST) - goto retry; + mutex_unlock(&group->mark_mutex); return ret; } @@ -697,7 +684,6 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; group->inotify_data.user = get_current_user(); if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > @@ -751,6 +737,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; + /* don't allow invalid bits: we don't want flags set */ + if (unlikely(!(mask & ALL_INOTIFY_BITS))) + return -EINVAL; + f = fdget(fd); if (unlikely(!f.file)) return -EBADF; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc6b49bf7360..923fe4a5f503 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -20,28 +20,29 @@ * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: - * The mark->refcnt tells how many "things" in the kernel currently are - * referencing this object. The object typically will live inside the kernel - * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task - * which can find this object holding the appropriete locks, can take a reference - * and the object itself is guaranteed to survive until the reference is dropped. + * The group->recnt and mark->refcnt tell how many "things" in the kernel + * currently are referencing the objects. Both kind of objects typically will + * live inside the kernel with a refcnt of 2, one for its creation and one for + * the reference a group and a mark hold to each other. + * If you are holding the appropriate locks, you can take a reference and the + * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: - * There are 3 spinlocks involved with fsnotify inode marks and they MUST - * be taken in order as follows: + * There are 3 locks involved with fsnotify inode marks and they MUST be taken + * in order as follows: * + * group->mark_mutex * mark->lock - * group->mark_lock * inode->i_lock * - * mark->lock protects 2 things, mark->group and mark->inode. You must hold - * that lock to dereference either of these things (they could be NULL even with - * the lock) - * - * group->mark_lock protects the marks_list anchored inside a given group - * and each mark is hooked via the g_list. It also sorta protects the - * free_g_list, which when used is anchored by a private list on the stack of the - * task which held the group->mark_lock. + * group->mark_mutex protects the marks_list anchored inside a given group and + * each mark is hooked via the g_list. It also protects the groups private + * data (i.e group limits). + + * mark->lock protects the marks attributes like its masks and flags. + * Furthermore it protects the access to a reference of the group that the mark + * is assigned to as well as the access to a reference of the inode/vfsmount + * that is being watched by the mark. * * inode->i_lock protects the i_fsnotify_marks list anchored inside a * given inode and each mark is hooked via the i_list. (and sorta the @@ -64,18 +65,11 @@ * inode. We take i_lock and walk the i_fsnotify_marks safely. For each * mark on the list we take a reference (so the mark can't disappear under us). * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; At this point we no - * longer fear anything finding the mark using the inode's list of marks. - * - * We can safely and locklessly run the private list on the stack of everything - * we just unattached from the original inode. For each mark on the private list - * we grab the mark-> and can thus dereference mark->group and mark->inode. If - * we see the group and inode are not NULL we take those locks. Now holding all - * 3 locks we can completely remove the mark from other tasks finding it in the - * future. Remember, 10 things might already be referencing this mark, but they - * better be holding a ref. We drop our reference we took before we unhooked it - * from the inode. When the ref hits 0 we can free the mark. - * + * private list anchored on the stack using i_free_list; we walk i_free_list + * and before we destroy the mark we make sure that we dont race with a + * concurrent destroy_group by getting a ref to the marks group and taking the + * groups mutex. + * Very similarly for freeing by group, except we use free_g_list. * * This has the very interesting property of being able to run concurrently with diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fa9c05f97af4..d267ea6aa1a0 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1372,7 +1372,7 @@ retry_writepage: * The page may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ - block_invalidatepage(page, 0); + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); ntfs_debug("Write outside i_size - truncated?"); return 0; diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index aa411c3f20e9..9e38dafa3bc7 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1004,13 +1004,11 @@ dir_err_out: /** * ntfs_filldir - ntfs specific filldir method * @vol: current ntfs volume - * @fpos: position in the directory * @ndir: ntfs inode of current directory * @ia_page: page in which the index allocation buffer @ie is in resides * @ie: current index entry * @name: buffer to use for the converted name - * @dirent: vfs filldir callback context - * @filldir: vfs filldir callback + * @actor: what to feed the entries to * * Convert the Unicode @name to the loaded NLS and pass it to the @filldir * callback. @@ -1024,12 +1022,12 @@ dir_err_out: * retake the lock if we are returning a non-zero value as ntfs_readdir() * would need to drop the lock immediately anyway. */ -static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, +static inline int ntfs_filldir(ntfs_volume *vol, ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, - u8 *name, void *dirent, filldir_t filldir) + u8 *name, struct dir_context *actor) { unsigned long mref; - int name_len, rc; + int name_len; unsigned dt_type; FILE_NAME_TYPE_FLAGS name_type; @@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, if (ia_page) unlock_page(ia_page); ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " - "0x%lx, DT_%s.", name, name_len, fpos, mref, + "0x%lx, DT_%s.", name, name_len, actor->pos, mref, dt_type == DT_DIR ? "DIR" : "REG"); - rc = filldir(dirent, name, name_len, fpos, mref, dt_type); + if (!dir_emit(actor, name, name_len, mref, dt_type)) + return 1; /* Relock the page but not if we are aborting ->readdir. */ - if (!rc && ia_page) + if (ia_page) lock_page(ia_page); - return rc; + return 0; } /* @@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, * removes them again after the write is complete after which it * unlocks the page. */ -static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ntfs_readdir(struct file *file, struct dir_context *actor) { s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t fpos, i_size; - struct inode *bmp_vi, *vdir = file_inode(filp); + loff_t i_size; + struct inode *bmp_vi, *vdir = file_inode(file); struct super_block *sb = vdir->i_sb; ntfs_inode *ndir = NTFS_I(vdir); ntfs_volume *vol = NTFS_SB(sb); @@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) u8 *kaddr, *bmp, *index_end; ntfs_attr_search_ctx *ctx; - fpos = filp->f_pos; ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", - vdir->i_ino, fpos); + vdir->i_ino, actor->pos); rc = err = 0; /* Are we at end of dir yet? */ i_size = i_size_read(vdir); - if (fpos >= i_size + vol->mft_record_size) - goto done; + if (actor->pos >= i_size + vol->mft_record_size) + return 0; /* Emulate . and .. for all directories. */ - if (!fpos) { - ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " - "inode 0x%lx, DT_DIR.", vdir->i_ino); - rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR); - if (rc) - goto done; - fpos++; - } - if (fpos == 1) { - ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, " - "inode 0x%lx, DT_DIR.", - (unsigned long)parent_ino(filp->f_path.dentry)); - rc = filldir(dirent, "..", 2, fpos, - parent_ino(filp->f_path.dentry), DT_DIR); - if (rc) - goto done; - fpos++; - } + if (!dir_emit_dots(file, actor)) + return 0; m = NULL; ctx = NULL; /* @@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto err_out; } /* Are we jumping straight into the index allocation attribute? */ - if (fpos >= vol->mft_record_size) + if (actor->pos >= vol->mft_record_size) goto skip_index_root; /* Get hold of the mft record for the directory. */ m = map_mft_record(ndir); @@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto err_out; } /* Get the offset into the index root attribute. */ - ir_pos = (s64)fpos; + ir_pos = (s64)actor->pos; /* Find the index root attribute in the mft record. */ err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 0, ctx); @@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (ir_pos > (u8*)ie - (u8*)ir) continue; /* Advance the position even if going to skip the entry. */ - fpos = (u8*)ie - (u8*)ir; + actor->pos = (u8*)ie - (u8*)ir; /* Submit the name to the filldir callback. */ - rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, - filldir); + rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); if (rc) { kfree(ir); goto abort; @@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (!NInoIndexAllocPresent(ndir)) goto EOD; /* Advance fpos to the beginning of the index allocation. */ - fpos = vol->mft_record_size; + actor->pos = vol->mft_record_size; skip_index_root: kaddr = NULL; prev_ia_pos = -1LL; /* Get the offset into the index allocation attribute. */ - ia_pos = (s64)fpos - vol->mft_record_size; + ia_pos = (s64)actor->pos - vol->mft_record_size; ia_mapping = vdir->i_mapping; ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); @@ -1409,7 +1390,7 @@ find_next_index_buffer: if (ia_pos - ia_start > (u8*)ie - (u8*)ia) continue; /* Advance the position even if going to skip the entry. */ - fpos = (u8*)ie - (u8*)ia + + actor->pos = (u8*)ie - (u8*)ia + (sle64_to_cpu(ia->index_block_vcn) << ndir->itype.index.vcn_size_bits) + vol->mft_record_size; @@ -1419,8 +1400,7 @@ find_next_index_buffer: * before returning, unless a non-zero value is returned in * which case the page is left unlocked. */ - rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, - filldir); + rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); if (rc) { /* @ia_page is already unlocked in this case. */ ntfs_unmap_page(ia_page); @@ -1439,18 +1419,9 @@ unm_EOD: iput(bmp_vi); EOD: /* We are finished, set fpos to EOD. */ - fpos = i_size + vol->mft_record_size; + actor->pos = i_size + vol->mft_record_size; abort: kfree(name); -done: -#ifdef DEBUG - if (!rc) - ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos); - else - ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.", - rc, fpos); -#endif - filp->f_pos = fpos; return 0; err_out: if (bmp_page) { @@ -1471,7 +1442,6 @@ iput_err_out: if (!err) err = -EIO; ntfs_debug("Failed. Returning error code %i.", -err); - filp->f_pos = fpos; return err; } @@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, const struct file_operations ntfs_dir_ops = { .llseek = generic_file_llseek, /* Seek inside directory. */ .read = generic_read_dir, /* Return -EISDIR. */ - .readdir = ntfs_readdir, /* Read directory contents. */ + .iterate = ntfs_readdir, /* Read directory contents. */ #ifdef NTFS_RW .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ /*.aio_fsync = ,*/ /* Sync all outstanding async diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5b2d4f0853ac..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> +#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -2129,7 +2130,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2138,7 +2138,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } - sb_end_write(inode->i_sb); return ret; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> +#include <linux/aio.h> #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b8a9d87231b1..17e6bdde96c5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode, &ref_tree, NULL); if (ret) { mlog_errno(ret); - goto out; + goto bail; } ret = ocfs2_prepare_refcount_change_for_del(inode, @@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode, &extra_blocks); if (ret < 0) { mlog_errno(ret); - goto out; + goto bail; } } @@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode, extra_blocks); if (ret) { mlog_errno(ret); - return ret; + goto bail; } mutex_lock(&tl_inode->i_mutex); @@ -5734,7 +5734,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out: mutex_unlock(&tl_inode->i_mutex); - +bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 20dfec72e903..79736a28d84f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, * from ext3. PageChecked() bits have been removed as OCFS2 does not * do journalled data. */ -static void ocfs2_invalidatepage(struct page *page, unsigned long offset) +static void ocfs2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - jbd2_journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset, length); } static int ocfs2_releasepage(struct page *page, gfp_t wait) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,6 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H +#include <linux/aio.h> + handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 42252bf64b51..5c1c864e81cc 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } -static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode) { int ret = -1; @@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE, bio); + submit_bio(WRITE_SYNC, bio); status = 0; bail: @@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) continue; - ret = o2hb_global_hearbeat_mode_set(i); + ret = o2hb_global_heartbeat_mode_set(i); if (!ret) printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", o2hb_heartbeat_mode_desc[i]); @@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { NULL, }; -static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { +static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { .show_attribute = o2hb_heartbeat_group_show, .store_attribute = o2hb_heartbeat_group_store, }; @@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { static struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; @@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid) assert_spin_locked(&o2hb_live_lock); list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + uuid = config_item_name(®->hr_item); /* local heartbeat */ @@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid) assert_spin_locked(&o2hb_live_lock); list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + uuid = config_item_name(®->hr_item); if (region_uuid) { if (strcmp(region_uuid, uuid)) @@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions) p = region_uuids; list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); if (numregs < max_regions) { memcpy(p, config_item_name(®->hr_item), diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index c19897d0fe14..1ec141e758d7 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node) /* This is analogous to hb_up. as a node's connection comes up we delay the * quorum decision until we see it heartbeating. the hold will be droped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if - * it's already heartbeating we we might be dropping a hold that conn_up got. + * it's already heartbeating we might be dropping a hold that conn_up got. * */ void o2quo_conn_up(u8 node) { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index aa88bd8bcedc..d644dc611425 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref) sc->sc_node = NULL; o2net_debug_del_sc(sc); + + if (sc->sc_page) + __free_page(sc->sc_page); kfree(sc); } @@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk) state_change = sc->sc_state_change; switch(sk->sk_state) { - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - o2net_sc_queue_work(sc, &sc->sc_connect_work); - break; - default: - printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT - " shutdown, state %d\n", - SC_NODEF_ARGS(sc), sk->sk_state); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); - break; + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; } out: read_unlock(&sk->sk_callback_lock); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f1e1aed8f638..eb760d8acd50 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1761,11 +1761,10 @@ bail: static int ocfs2_dir_foreach_blk_id(struct inode *inode, u64 *f_version, - loff_t *f_pos, void *priv, - filldir_t filldir, int *filldir_err) + struct dir_context *ctx) { - int ret, i, filldir_ret; - unsigned long offset = *f_pos; + int ret, i; + unsigned long offset = ctx->pos; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_inline_data *data; @@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, di = (struct ocfs2_dinode *)di_bh->b_data; data = &di->id2.i_data; - while (*f_pos < i_size_read(inode)) { -revalidate: + while (ctx->pos < i_size_read(inode)) { /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block @@ -1802,50 +1800,31 @@ revalidate: break; i += le16_to_cpu(de->rec_len); } - *f_pos = offset = i; + ctx->pos = offset = i; *f_version = inode->i_version; } - de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); - if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { + de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); + if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { /* On error, skip the f_pos to the end. */ - *f_pos = i_size_read(inode); - goto out; + ctx->pos = i_size_read(inode); + break; } offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = *f_version; unsigned char d_type = DT_UNKNOWN; if (de->file_type < OCFS2_FT_MAX) d_type = ocfs2_filetype_table[de->file_type]; - filldir_ret = filldir(priv, de->name, - de->name_len, - *f_pos, - le64_to_cpu(de->inode), - d_type); - if (filldir_ret) { - if (filldir_err) - *filldir_err = filldir_ret; - break; - } - if (version != *f_version) - goto revalidate; + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode), d_type)) + goto out; } - *f_pos += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } - out: brelse(di_bh); - return 0; } @@ -1855,27 +1834,26 @@ out: */ static int ocfs2_dir_foreach_blk_el(struct inode *inode, u64 *f_version, - loff_t *f_pos, void *priv, - filldir_t filldir, int *filldir_err) + struct dir_context *ctx, + bool persist) { - int error = 0; unsigned long offset, blk, last_ra_blk = 0; - int i, stored; + int i; struct buffer_head * bh, * tmp; struct ocfs2_dir_entry * de; struct super_block * sb = inode->i_sb; unsigned int ra_sectors = 16; + int stored = 0; - stored = 0; bh = NULL; - offset = (*f_pos) & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && *f_pos < i_size_read(inode)) { - blk = (*f_pos) >> sb->s_blocksize_bits; + while (ctx->pos < i_size_read(inode)) { + blk = ctx->pos >> sb->s_blocksize_bits; if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { /* Skip the corrupt dirblock and keep trying */ - *f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } @@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, ra_sectors = 8; } -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block @@ -1917,93 +1894,64 @@ revalidate: i += le16_to_cpu(de->rec_len); } offset = i; - *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; *f_version = inode->i_version; } - while (!error && *f_pos < i_size_read(inode) + while (ctx->pos < i_size_read(inode) && offset < sb->s_blocksize) { de = (struct ocfs2_dir_entry *) (bh->b_data + offset); if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { /* On error, skip the f_pos to the next block. */ - *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; brelse(bh); - goto out; + continue; } - offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - unsigned long version = *f_version; unsigned char d_type = DT_UNKNOWN; if (de->file_type < OCFS2_FT_MAX) d_type = ocfs2_filetype_table[de->file_type]; - error = filldir(priv, de->name, + if (!dir_emit(ctx, de->name, de->name_len, - *f_pos, le64_to_cpu(de->inode), - d_type); - if (error) { - if (filldir_err) - *filldir_err = error; - break; + d_type)) { + brelse(bh); + return 0; } - if (version != *f_version) - goto revalidate; - stored ++; + stored++; } - *f_pos += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } offset = 0; brelse(bh); bh = NULL; + if (!persist && stored) + break; } - - stored = 0; -out: - return stored; + return 0; } static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, - loff_t *f_pos, void *priv, filldir_t filldir, - int *filldir_err) + struct dir_context *ctx, + bool persist) { if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, - filldir, filldir_err); - - return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir, - filldir_err); + return ocfs2_dir_foreach_blk_id(inode, f_version, ctx); + return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist); } /* * This is intended to be called from inside other kernel functions, * so we fake some arguments. */ -int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, - filldir_t filldir) +int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) { - int ret = 0, filldir_err = 0; u64 version = inode->i_version; - - while (*f_pos < i_size_read(inode)) { - ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv, - filldir, &filldir_err); - if (ret || filldir_err) - break; - } - - if (ret > 0) - ret = -EIO; - + ocfs2_dir_foreach_blk(inode, &version, ctx, true); return 0; } @@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, * ocfs2_readdir() * */ -int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +int ocfs2_readdir(struct file *file, struct dir_context *ctx) { int error = 0; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention @@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) goto bail_nolock; } - error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, - dirent, filldir, NULL); + error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); ocfs2_inode_unlock(inode, lock_level); if (error) @@ -2120,6 +2067,7 @@ bail: } struct ocfs2_empty_dir_priv { + struct dir_context ctx; unsigned seen_dot; unsigned seen_dot_dot; unsigned seen_other; @@ -2204,8 +2152,9 @@ out: int ocfs2_empty_dir(struct inode *inode) { int ret; - loff_t start = 0; - struct ocfs2_empty_dir_priv priv; + struct ocfs2_empty_dir_priv priv = { + .ctx.actor = ocfs2_empty_dir_filldir + }; memset(&priv, 0, sizeof(priv)); @@ -2219,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode) */ } - ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); + ret = ocfs2_dir_foreach(inode, &priv.ctx); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index e683f3deb645..f0344b75b14d 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name, struct ocfs2_dir_lookup_result *res); int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, int namelen, u64 *blkno); -int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); -int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, - filldir_t filldir); +int ocfs2_readdir(struct file *file, struct dir_context *ctx); +int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx); int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 975810b98492..47e67c2d228f 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, lock->ml.node); } } else { + status = DLM_NORMAL; dlm_lock_get(lock); list_add_tail(&lock->list, &res->blocked); kick_thread = 1; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index eeac97bb3bfa..773bd32bfd8c 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -55,9 +55,6 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_recovery_thread(void *data); -void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); -int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); -void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); static int dlm_do_recovery(struct dlm_ctxt *dlm); static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); @@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node) { struct dlm_lock_request lr; - enum dlm_status ret; + int ret; mlog(0, "\n"); @@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, lr.dead_node = dead_node; // send message - ret = DLM_NOLOCKMGR; ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, &lr, sizeof(lr), request_from, NULL); @@ -1408,6 +1404,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; @@ -1498,10 +1495,8 @@ leave: dlm_put(dlm); if (ret < 0) { - if (buf) - kfree(buf); - if (item) - kfree(item); + kfree(buf); + kfree(item); mlog_errno(ret); } @@ -2697,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); + dlm_put(dlm); return -EAGAIN; } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags, subclass, _RET_IP_); if (status < 0) { - if (status != -EAGAIN && status != -EIOCBRETRY) + if (status != -EAGAIN) mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 1c39efb71bab..2487116d0d33 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, &hole_size, &rec, &is_last); if (ret) { mlog_errno(ret); - goto out; + goto out_unlock; } if (rec.e_blkno == 0ULL) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6474cb44004d..41000f223ca4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2248,8 +2248,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - sb_start_write(inode->i_sb); - appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 1 : 0; @@ -2290,7 +2288,7 @@ relock: ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); - goto out_sems; + goto out; } ocfs2_inode_unlock(inode, 1); @@ -2423,7 +2421,6 @@ out_sems: ocfs2_iocb_clear_sem_locked(iocb); mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); if (written) ret = written; @@ -2468,8 +2465,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name, len); - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + pipe_lock(pipe); splice_from_pipe_begin(&sd); do { @@ -2489,8 +2485,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, } while (ret > 0); splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; @@ -2651,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) goto out; } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - ret = -EINVAL; - if (!ret && offset > inode->i_sb->s_maxbytes) - ret = -EINVAL; - if (ret) - goto out; - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); @@ -2717,7 +2702,7 @@ const struct file_operations ocfs2_fops = { const struct file_operations ocfs2_dops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ocfs2_readdir, + .iterate = ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2764,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = { const struct file_operations ocfs2_dops_no_plocks = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ocfs2_readdir, + .iterate = ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..621fc73bf23d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode, int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); -int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); struct buffer_head *ocfs2_bread(struct inode *inode, int block, int *err, int reada); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 752f0b26221d..0c60ef2d8056 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -101,13 +101,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; - } - oldflags = ocfs2_inode->ip_attr; flags = flags & mask; flags |= oldflags & ~mask; @@ -120,7 +113,14 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) - goto bail_commit; + goto bail_unlock; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; } ocfs2_inode->ip_attr = flags; @@ -130,8 +130,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (status < 0) mlog_errno(status); -bail_commit: ocfs2_commit_trans(osb, handle); + bail_unlock: ocfs2_inode_unlock(inode, 1); bail: @@ -706,8 +706,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode, o2info_set_request_filled(&oiff->iff_req); - if (o2info_to_user(*oiff, req)) + if (o2info_to_user(*oiff, req)) { + status = -EFAULT; goto bail; + } status = 0; bail: diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 8eccfabcd12e..242170d83971 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) } struct ocfs2_orphan_filldir_priv { + struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; }; @@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, { int status; struct inode *orphan_dir_inode = NULL; - struct ocfs2_orphan_filldir_priv priv; - loff_t pos = 0; - - priv.osb = osb; - priv.head = *head; + struct ocfs2_orphan_filldir_priv priv = { + .ctx.actor = ocfs2_orphan_filldir, + .osb = osb, + .head = *head + }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, @@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, goto out; } - status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, - ocfs2_orphan_filldir); + status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx); if (status) { mlog_errno(status); goto out_cluster; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index a3385b63ff5e..96f9ac237e86 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) { - atomic_set(&osb->needs_checkpoint, 1); wake_up(&osb->checkpoint_event); } @@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb); + ocfs2_quota_trans_credits(sb) + bits_wanted; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 9f8dcadd9a50..f1fc172175b6 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -471,7 +471,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, int ret, goal_bit = 0; struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *bg = NULL; + struct ocfs2_group_desc *bg; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int c_to_b = 1 << (osb->s_clustersize_bits - inode->i_sb->s_blocksize_bits); @@ -482,13 +482,6 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, range->me_goal); /* - * moving goal is not allowd to start with a group desc blok(#0 blk) - * let's compromise to the latter cluster. - */ - if (range->me_goal == le64_to_cpu(bg->bg_blkno)) - range->me_goal += c_to_b; - - /* * validate goal sits within global_bitmap, and return the victim * group desc */ @@ -502,6 +495,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* * movement is not gonna cross two groups. */ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < @@ -1057,42 +1057,40 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) struct inode *inode = file_inode(filp); struct ocfs2_move_extents range; - struct ocfs2_move_extents_context *context = NULL; + struct ocfs2_move_extents_context *context; + + if (!argp) + return -EINVAL; status = mnt_want_write_file(filp); if (status) return status; if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) - goto out; + goto out_drop; if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { status = -EPERM; - goto out; + goto out_drop; } context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); if (!context) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_drop; } context->inode = inode; context->file = filp; - if (argp) { - if (copy_from_user(&range, argp, sizeof(range))) { - status = -EFAULT; - goto out; - } - } else { - status = -EINVAL; - goto out; + if (copy_from_user(&range, argp, sizeof(range))) { + status = -EFAULT; + goto out_free; } if (range.me_start > i_size_read(inode)) - goto out; + goto out_free; if (range.me_start + range.me_len > i_size_read(inode)) range.me_len = i_size_read(inode) - range.me_start; @@ -1124,25 +1122,24 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) status = ocfs2_validate_and_adjust_move_goal(inode, &range); if (status) - goto out; + goto out_copy; } status = ocfs2_move_extents(context); if (status) mlog_errno(status); -out: +out_copy: /* * movement/defragmentation may end up being partially completed, * that's the reason why we need to return userspace the finished * length and new_offset even if failure happens somewhere. */ - if (argp) { - if (copy_to_user(argp, &range, sizeof(range))) - status = -EFAULT; - } + if (copy_to_user(argp, &range, sizeof(range))) + status = -EFAULT; +out_free: kfree(context); - +out_drop: mnt_drop_write_file(filp); return status; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b57c243..be3f8676a438 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); - le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); fe->i_atime = fe->i_ctime = fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = @@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry) return ret; } -static inline int inode_is_unlinkable(struct inode *inode) +static inline int ocfs2_inode_is_unlinkable(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { if (inode->i_nlink == 2) @@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir, { int status; int child_locked = 0; + bool is_unlinkable = false; struct inode *inode = dentry->d_inode; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); @@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - if (inode_is_unlinkable(inode)) { + if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, orphan_name, &orphan_insert); @@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir, mlog_errno(status); goto leave; } + is_unlinkable = true; } handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); @@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir, fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (inode_is_unlinkable(inode)) { - status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto leave; - } - } - /* delete the name from the parent dir */ status = ocfs2_delete_entry(handle, dir, &lookup); if (status < 0) { @@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir, mlog_errno(status); if (S_ISDIR(inode->i_mode)) inc_nlink(dir); + goto leave; + } + + if (is_unlinkable) { + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, + orphan_name, &orphan_insert, orphan_dir); + if (status < 0) + mlog_errno(status); } leave: @@ -947,7 +948,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; @@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* we're a cluster, and nlink can change on disk from * underneath us... */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; @@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_dir_bh, lookup); if (status < 0) { mlog_errno(status); - goto leave; + goto rollback; } - /* - * We're going to journal the change of i_flags and i_orphaned_slot. - * It's safe anyway, though some callers may duplicate the journaling. - * Journaling within the func just make the logic look more - * straightforward. - */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(inode), - fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; /* Record which orphan dir our inode now resides @@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); +rollback: + if (status < 0) { + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); + } + leave: brelse(orphan_dir_bh); - if (status) - mlog_errno(status); return status; } @@ -2216,7 +2222,7 @@ out: brelse(orphan_dir_bh); - return 0; + return ret; } int ocfs2_create_inode_in_orphan(struct inode *dir, @@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } di = (struct ocfs2_dinode *)di_bh->b_data; - le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d355e6e36b36..3a903470c794 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -347,7 +347,6 @@ struct ocfs2_super struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; - atomic_t needs_checkpoint; struct ocfs2_journal *journal; unsigned long osb_commit_interval; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b7e74b580c0f..5397c07ce608 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle, int status; /* there is a really tiny chance the journal calls could fail, * but we wouldn't want inconsistent blocks in *any* case. */ - u64 fe_ptr, bg_ptr, prev_bg_ptr; + u64 bg_ptr, prev_bg_ptr; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; @@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle, (unsigned long long)le64_to_cpu(bg->bg_blkno), (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); - fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), prev_bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + if (status < 0) + goto out; prev_bg->bg_next_group = bg->bg_next_group; ocfs2_journal_dirty(handle, prev_bg_bh); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + if (status < 0) + goto out_rollback_prev_bg; bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; ocfs2_journal_dirty(handle, bg_bh); status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + if (status < 0) + goto out_rollback_bg; fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; ocfs2_journal_dirty(handle, fe_bh); -out_rollback: - if (status < 0) { - fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); - bg->bg_next_group = cpu_to_le64(bg_ptr); - prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); - } - - if (status) +out: + if (status < 0) mlog_errno(status); return status; + +out_rollback_bg: + bg->bg_next_group = cpu_to_le64(bg_ptr); +out_rollback_prev_bg: + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + goto out; } static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 01b85165552b..854d80955bf8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) spin_unlock(&osb->osb_lock); out += snprintf(buf + out, len - out, - "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), - osb->osb_commit_interval, - atomic_read(&osb->needs_checkpoint)); + osb->osb_commit_interval); out += snprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", @@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb, } init_waitqueue_head(&osb->checkpoint_event); - atomic_set(&osb->needs_checkpoint, 0); osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2e3ea308c144..317ef0abccbb 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, { int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; struct ocfs2_xa_loc loc; if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) @@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, down_write(&oi->ip_alloc_sem); if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { - if (!ocfs2_xattr_has_space_inline(inode, di)) { - ret = -ENOSPC; - goto out; - } - } - - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); if (ret) { if (ret != -ENOSPC) @@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) } new_oi = OCFS2_I(args->new_inode); + /* + * Adjust extent record count to reserve space for extended attribute. + * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). + */ + if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && + !(ocfs2_inode_is_fast_symlink(args->new_inode))) { + struct ocfs2_extent_list *el = &new_di->id2.i_list; + le16_add_cpu(&el->l_count, -(inline_size / + sizeof(struct ocfs2_extent_rec))); + } spin_lock(&new_oi->ip_lock); new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index acbaebcad3a8..1b8e9e8405b2 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, return is_bad; } -static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, +static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, u64 fsblock, int hindex) { - struct inode *dir = file_inode(filp); - struct buffer_head *bh; - struct omfs_inode *oi; - u64 self; - int res = 0; - unsigned char d_type; - /* follow chain in this bucket */ while (fsblock != ~0) { - bh = omfs_bread(dir->i_sb, fsblock); + struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock); + struct omfs_inode *oi; + u64 self; + unsigned char d_type; + if (!bh) - goto out; + return true; oi = (struct omfs_inode *) bh->b_data; if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { brelse(bh); - goto out; + return true; } self = fsblock; @@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; - res = filldir(dirent, oi->i_name, strnlen(oi->i_name, - OMFS_NAMELEN), filp->f_pos, self, d_type); + if (!dir_emit(ctx, oi->i_name, + strnlen(oi->i_name, OMFS_NAMELEN), + self, d_type)) { + brelse(bh); + return false; + } brelse(bh); - if (res < 0) - break; - filp->f_pos++; + ctx->pos++; } -out: - return res; + return true; } static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -403,60 +401,44 @@ out: return err; } -static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int omfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *dir = file_inode(filp); + struct inode *dir = file_inode(file); struct buffer_head *bh; - loff_t offset, res; + __be64 *p; unsigned int hchain, hindex; int nbuckets; - u64 fsblock; - int ret = -EINVAL; - - if (filp->f_pos >> 32) - goto success; - - switch ((unsigned long) filp->f_pos) { - case 0: - if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) - goto success; - filp->f_pos++; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, 1, - parent_ino(filp->f_dentry), DT_DIR) < 0) - goto success; - filp->f_pos = 1 << 20; - /* fall through */ + + if (ctx->pos >> 32) + return -EINVAL; + + if (ctx->pos < 1 << 20) { + if (!dir_emit_dots(file, ctx)) + return 0; + ctx->pos = 1 << 20; } nbuckets = (dir->i_size - OMFS_DIR_START) / 8; /* high 12 bits store bucket + 1 and low 20 bits store hash index */ - hchain = (filp->f_pos >> 20) - 1; - hindex = filp->f_pos & 0xfffff; + hchain = (ctx->pos >> 20) - 1; + hindex = ctx->pos & 0xfffff; bh = omfs_bread(dir->i_sb, dir->i_ino); if (!bh) - goto out; + return -EINVAL; - offset = OMFS_DIR_START + hchain * 8; + p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain; - for (; hchain < nbuckets; hchain++, offset += 8) { - fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); - - res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); - hindex = 0; - if (res < 0) + for (; hchain < nbuckets; hchain++) { + __u64 fsblock = be64_to_cpu(*p++); + if (!omfs_fill_chain(dir, ctx, fsblock, hindex)) break; - - filp->f_pos = (hchain+2) << 20; + hindex = 0; + ctx->pos = (hchain+2) << 20; } brelse(bh); -success: - ret = 0; -out: - return ret; + return 0; } const struct inode_operations omfs_dir_inops = { @@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = { const struct file_operations omfs_dir_operations = { .read = generic_read_dir, - .readdir = omfs_readdir, + .iterate = omfs_readdir, .llseek = generic_file_llseek, }; diff --git a/fs/open.c b/fs/open.c index 68354466879f..fca72c4d3f17 100644 --- a/fs/open.c +++ b/fs/open.c @@ -197,10 +197,7 @@ out: SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { - long ret = do_sys_ftruncate(fd, length, 1); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT @@ -212,32 +209,15 @@ COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 -SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_truncate64(long path, loff_t length) -{ - return SYSC_truncate64((const char __user *) path, length); -} -SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); -#endif -SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) -{ - long ret = do_sys_ftruncate(fd, length, 0); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; -} -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_ftruncate64(long fd, loff_t length) +SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { - return SYSC_ftruncate64((unsigned int) fd, length); + return do_sys_ftruncate(fd, length, 0); } -SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); -#endif #endif /* BITS_PER_LONG == 32 */ @@ -299,7 +279,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return ret; } -SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { struct fd f = fdget(fd); int error = -EBADF; @@ -311,14 +291,6 @@ SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) return error; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) -{ - return SYSC_fallocate((int)fd, (int)mode, offset, len); -} -SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); -#endif - /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and @@ -868,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & __O_SYNC) flags |= O_DSYNC; - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - if (flags & O_PATH) { + if (flags & O_TMPFILE) { + if (!(flags & O_CREAT)) + return -EINVAL; + acc_mode = MAY_OPEN | ACC_MODE(flags); + } else if (flags & O_PATH) { + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; acc_mode = 0; } else { @@ -904,7 +880,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - return lookup_flags; + op->lookup_flags = lookup_flags; + return 0; } /** @@ -921,8 +898,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int lookup = build_open_flags(flags, mode, &op); - return do_filp_open(AT_FDCWD, name, &op, lookup); + int err = build_open_flags(flags, mode, &op); + return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); } /** @@ -947,65 +924,61 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *filename, int flags) { struct open_flags op; - int lookup = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, 0, &op); + if (err) + return ERR_PTR(err); if (flags & O_CREAT) return ERR_PTR(-EINVAL); if (!filename && (flags & O_DIRECTORY)) if (!dentry->d_inode->i_op->lookup) return ERR_PTR(-ENOTDIR); - return do_file_open_root(dentry, mnt, filename, &op, lookup); + return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; - int lookup = build_open_flags(flags, mode, &op); - struct filename *tmp = getname(filename); - int fd = PTR_ERR(tmp); - - if (!IS_ERR(tmp)) { - fd = get_unused_fd_flags(flags); - if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, &op, lookup); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fsnotify_open(f); - fd_install(fd, f); - } + int fd = build_open_flags(flags, mode, &op); + struct filename *tmp; + + if (fd) + return fd; + + tmp = getname(filename); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + fd = get_unused_fd_flags(flags); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); } - putname(tmp); } + putname(tmp); return fd; } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(AT_FDCWD, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, flags, mode); - return ret; + return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(dfd, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, dfd, filename, flags, mode); - return ret; + return do_sys_open(dfd, filename, flags, mode); } #ifndef __alpha__ diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 75885ffde44e..8c0ceb8dd1f7 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = { .release = seq_release, }; -static int openpromfs_readdir(struct file *, void *, filldir_t); +static int openpromfs_readdir(struct file *, struct dir_context *); static const struct file_operations openprom_operations = { .read = generic_read_dir, - .readdir = openpromfs_readdir, + .iterate = openpromfs_readdir, .llseek = generic_file_llseek, }; @@ -260,71 +260,64 @@ found: return NULL; } -static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int openpromfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct op_inode_info *oi = OP_I(inode); struct device_node *dp = oi->u.node; struct device_node *child; struct property *prop; - unsigned int ino; int i; mutex_lock(&op_mutex); - ino = inode->i_ino; - i = filp->f_pos; - switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; - i++; - filp->f_pos++; - /* fall thru */ - case 1: - if (filldir(dirent, "..", 2, i, + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, (dp->parent == NULL ? OPENPROM_ROOT_INO : - dp->parent->unique_id), DT_DIR) < 0) + dp->parent->unique_id), DT_DIR)) goto out; - i++; - filp->f_pos++; - /* fall thru */ - default: - i -= 2; - - /* First, the children nodes as directories. */ - child = dp->child; - while (i && child) { - child = child->sibling; - i--; - } - while (child) { - if (filldir(dirent, - child->path_component_name, - strlen(child->path_component_name), - filp->f_pos, child->unique_id, DT_DIR) < 0) - goto out; - - filp->f_pos++; - child = child->sibling; - } + ctx->pos = 2; + } + i = ctx->pos - 2; - /* Next, the properties as files. */ - prop = dp->properties; - while (i && prop) { - prop = prop->next; - i--; - } - while (prop) { - if (filldir(dirent, prop->name, strlen(prop->name), - filp->f_pos, prop->unique_id, DT_REG) < 0) - goto out; + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; + i--; + } + while (child) { + if (!dir_emit(ctx, + child->path_component_name, + strlen(child->path_component_name), + child->unique_id, DT_DIR)) + goto out; - filp->f_pos++; - prop = prop->next; - } + ctx->pos++; + child = child->sibling; + } + + /* Next, the properties as files. */ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; } + while (prop) { + if (!dir_emit(ctx, prop->name, strlen(prop->name), + prop->unique_id, DT_REG)) + goto out; + + ctx->pos++; + prop = prop->next; + } + out: mutex_unlock(&op_mutex); return 0; diff --git a/fs/pipe.c b/fs/pipe.c index 2234f3f61f8d..d2c45e14e6d8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,10 +21,13 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/ioctls.h> +#include "internal.h" + /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size @@ -53,8 +56,8 @@ unsigned int pipe_min_size = PAGE_SIZE; static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) { - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, subclass); + if (pipe->files) + mutex_lock_nested(&pipe->mutex, subclass); } void pipe_lock(struct pipe_inode_info *pipe) @@ -68,11 +71,21 @@ EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + if (pipe->files) + mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); +static inline void __pipe_lock(struct pipe_inode_info *pipe) +{ + mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); +} + +static inline void __pipe_unlock(struct pipe_inode_info *pipe) +{ + mutex_unlock(&pipe->mutex); +} + void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { @@ -361,8 +374,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -375,8 +387,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); for (;;) { int bufs = pipe->nrbufs; if (bufs) { @@ -464,7 +475,7 @@ redo: } pipe_wait(pipe); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { @@ -486,8 +497,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) { struct file *filp = iocb->ki_filp; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -501,8 +511,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -649,7 +658,7 @@ redo2: pipe->waiting_writers--; } out: - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); @@ -662,29 +671,14 @@ out: return ret; } -static ssize_t -bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - return -EBADF; -} - -static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) -{ - return -EBADF; -} - static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); count = 0; buf = pipe->curbuf; nrbufs = pipe->nrbufs; @@ -692,7 +686,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[buf].len; buf = (buf+1) & (pipe->buffers - 1); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); return put_user(count, (int __user *)arg); default: @@ -705,8 +699,7 @@ static unsigned int pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe = inode->i_pipe; + struct pipe_inode_info *pipe = filp->private_data; int nrbufs; poll_wait(filp, &pipe->wait, wait); @@ -734,197 +727,56 @@ pipe_poll(struct file *filp, poll_table *wait) } static int -pipe_release(struct inode *inode, int decr, int decw) +pipe_release(struct inode *inode, struct file *file) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = inode->i_pipe; + int kill = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; - pipe->readers -= decr; - pipe->writers -= decw; + __pipe_lock(pipe); + if (file->f_mode & FMODE_READ) + pipe->readers--; + if (file->f_mode & FMODE_WRITE) + pipe->writers--; - if (!pipe->readers && !pipe->writers) { - free_pipe_info(inode); - } else { + if (pipe->readers || pipe->writers) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(&inode->i_mutex); - - return 0; -} - -static int -pipe_read_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = file_inode(filp); - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); - mutex_unlock(&inode->i_mutex); - - return retval; -} - - -static int -pipe_write_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = file_inode(filp); - int retval; + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + __pipe_unlock(pipe); - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); - mutex_unlock(&inode->i_mutex); + if (kill) + free_pipe_info(pipe); - return retval; + return 0; } - static int -pipe_rdwr_fasync(int fd, struct file *filp, int on) +pipe_fasync(int fd, struct file *filp, int on) { - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe = inode->i_pipe; - int retval; + struct pipe_inode_info *pipe = filp->private_data; + int retval = 0; - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); - if (retval >= 0) { + __pipe_lock(pipe); + if (filp->f_mode & FMODE_READ) + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); + if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - if (retval < 0) /* this can happen only if on == T */ + if (retval < 0 && (filp->f_mode & FMODE_READ)) + /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); return retval; } - -static int -pipe_read_release(struct inode *inode, struct file *filp) -{ - return pipe_release(inode, 1, 0); -} - -static int -pipe_write_release(struct inode *inode, struct file *filp) -{ - return pipe_release(inode, 0, 1); -} - -static int -pipe_rdwr_release(struct inode *inode, struct file *filp) -{ - int decr, decw; - - decr = (filp->f_mode & FMODE_READ) != 0; - decw = (filp->f_mode & FMODE_WRITE) != 0; - return pipe_release(inode, decr, decw); -} - -static int -pipe_read_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - inode->i_pipe->readers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -static int -pipe_write_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - inode->i_pipe->writers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -static int -pipe_rdwr_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE))) - return -EINVAL; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* - * The file_operations structs are not static because they - * are also used in linux/fs/fifo.c to do operations on FIFOs. - * - * Pipes reuse fifos' file_operations structs. - */ -const struct file_operations read_pipefifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = bad_pipe_w, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_read_open, - .release = pipe_read_release, - .fasync = pipe_read_fasync, -}; - -const struct file_operations write_pipefifo_fops = { - .llseek = no_llseek, - .read = bad_pipe_r, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_write_open, - .release = pipe_write_release, - .fasync = pipe_write_fasync, -}; - -const struct file_operations rdwr_pipefifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_rdwr_open, - .release = pipe_rdwr_release, - .fasync = pipe_rdwr_fasync, -}; - -struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; @@ -934,8 +786,8 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; pipe->buffers = PIPE_DEF_BUFFERS; + mutex_init(&pipe->mutex); return pipe; } kfree(pipe); @@ -944,7 +796,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) return NULL; } -void __free_pipe_info(struct pipe_inode_info *pipe) +void free_pipe_info(struct pipe_inode_info *pipe) { int i; @@ -959,12 +811,6 @@ void __free_pipe_info(struct pipe_inode_info *pipe) kfree(pipe); } -void free_pipe_info(struct inode *inode) -{ - __free_pipe_info(inode->i_pipe); - inode->i_pipe = NULL; -} - static struct vfsmount *pipe_mnt __read_mostly; /* @@ -990,13 +836,14 @@ static struct inode * get_pipe_inode(void) inode->i_ino = get_next_ino(); - pipe = alloc_pipe_info(inode); + pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; - inode->i_pipe = pipe; + inode->i_pipe = pipe; + pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &rdwr_pipefifo_fops; + inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, @@ -1039,17 +886,19 @@ int create_pipe_files(struct file **res, int flags) d_instantiate(path.dentry, inode); err = -ENFILE; - f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); + f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); if (IS_ERR(f)) goto err_dentry; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); + f->private_data = inode->i_pipe; - res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); + res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); if (IS_ERR(res[0])) goto err_file; path_get(&path); + res[0]->private_data = inode->i_pipe; res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); res[1] = f; return 0; @@ -1057,12 +906,12 @@ int create_pipe_files(struct file **res, int flags) err_file: put_filp(f); err_dentry: - free_pipe_info(inode); + free_pipe_info(inode->i_pipe); path_put(&path); return err; err_inode: - free_pipe_info(inode); + free_pipe_info(inode->i_pipe); iput(inode); return err; } @@ -1144,6 +993,168 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return sys_pipe2(fildes, 0); } +static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) +{ + int cur = *cnt; + + while (cur == *cnt) { + pipe_wait(pipe); + if (signal_pending(current)) + break; + } + return cur == *cnt ? -ERESTARTSYS : 0; +} + +static void wake_up_partner(struct pipe_inode_info *pipe) +{ + wake_up_interruptible(&pipe->wait); +} + +static int fifo_open(struct inode *inode, struct file *filp) +{ + struct pipe_inode_info *pipe; + bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; + int kill = 0; + int ret; + + filp->f_version = 0; + + spin_lock(&inode->i_lock); + if (inode->i_pipe) { + pipe = inode->i_pipe; + pipe->files++; + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + pipe = alloc_pipe_info(); + if (!pipe) + return -ENOMEM; + pipe->files = 1; + spin_lock(&inode->i_lock); + if (unlikely(inode->i_pipe)) { + inode->i_pipe->files++; + spin_unlock(&inode->i_lock); + free_pipe_info(pipe); + pipe = inode->i_pipe; + } else { + inode->i_pipe = pipe; + spin_unlock(&inode->i_lock); + } + } + filp->private_data = pipe; + /* OK, we have a pipe and it's pinned down */ + + __pipe_lock(pipe); + + /* We can only do regular read/write on fifos */ + filp->f_mode &= (FMODE_READ | FMODE_WRITE); + + switch (filp->f_mode) { + case FMODE_READ: + /* + * O_RDONLY + * POSIX.1 says that O_NONBLOCK means return with the FIFO + * opened, even when there is no process writing the FIFO. + */ + pipe->r_counter++; + if (pipe->readers++ == 0) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->writers) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { + if (wait_for_partner(pipe, &pipe->w_counter)) + goto err_rd; + } + } + break; + + case FMODE_WRITE: + /* + * O_WRONLY + * POSIX.1 says that O_NONBLOCK means return -1 with + * errno=ENXIO when there is no process reading the FIFO. + */ + ret = -ENXIO; + if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) + goto err; + + pipe->w_counter++; + if (!pipe->writers++) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->readers) { + if (wait_for_partner(pipe, &pipe->r_counter)) + goto err_wr; + } + break; + + case FMODE_READ | FMODE_WRITE: + /* + * O_RDWR + * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. + * This implementation will NEVER block on a O_RDWR open, since + * the process can at least talk to itself. + */ + + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) + wake_up_partner(pipe); + break; + + default: + ret = -EINVAL; + goto err; + } + + /* Ok! */ + __pipe_unlock(pipe); + return 0; + +err_rd: + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err_wr: + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err: + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + __pipe_unlock(pipe); + if (kill) + free_pipe_info(pipe); + return ret; +} + +const struct file_operations pipefifo_fops = { + .open = fifo_open, + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, +}; + /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. @@ -1229,9 +1240,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, */ struct pipe_inode_info *get_pipe_info(struct file *file) { - struct inode *i = file_inode(file); - - return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; + return file->f_op == &pipefifo_fops ? file->private_data : NULL; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1243,7 +1252,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!pipe) return -EBADF; - mutex_lock(&pipe->inode->i_mutex); + __pipe_lock(pipe); switch (cmd) { case F_SETPIPE_SZ: { @@ -1272,7 +1281,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) } out: - mutex_unlock(&pipe->inode->i_mutex); + __pipe_unlock(pipe); return ret; } diff --git a/fs/pnode.c b/fs/pnode.c index 8b29d2164da6..9af0df15256e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } - if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) mnt_release_group_id(mnt); list_del_init(&mnt->mnt_share); @@ -218,7 +219,7 @@ static struct mount *get_source(struct mount *dest, * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ -int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, +int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct list_head *tree_list) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; @@ -227,7 +228,6 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, struct mount *prev_dest_mnt = dest_mnt; struct mount *prev_src_mnt = source_mnt; LIST_HEAD(tmp_list); - LIST_HEAD(umount_list); for (m = propagation_next(dest_mnt, dest_mnt); m; m = propagation_next(m, dest_mnt)) { @@ -250,8 +250,8 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, goto out; } - if (is_subdir(dest_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_dentry, child); + if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { + mnt_set_mountpoint(m, dest_mp, child); list_add_tail(&child->mnt_hash, tree_list); } else { /* @@ -267,10 +267,9 @@ out: br_write_lock(&vfsmount_lock); while (!list_empty(&tmp_list)) { child = list_first_entry(&tmp_list, struct mount, mnt_hash); - umount_tree(child, 0, &umount_list); + umount_tree(child, 0); } br_write_unlock(&vfsmount_lock); - release_mounts(&umount_list); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index a0493d5ebfbf..b091445c1c4a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -32,17 +32,16 @@ static inline void set_mnt_shared(struct mount *mnt) } void change_mnt_propagation(struct mount *, int); -int propagate_mnt(struct mount *, struct dentry *, struct mount *, +int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct list_head *); int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); -void mnt_set_mountpoint(struct mount *, struct dentry *, +void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); -void release_mounts(struct list_head *); -void umount_tree(struct mount *, int, struct list_head *); +void umount_tree(struct mount *, int); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 712f24db9600..ab30716584f5 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -5,7 +5,7 @@ obj-y += proc.o proc-y := nommu.o task_nommu.o -proc-$(CONFIG_MMU) := mmu.o task_mmu.o +proc-$(CONFIG_MMU) := task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ fd.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 69078c7cef1f..1485e38daaa3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/flex_array.h> +#include <linux/posix-timers.h> #ifdef CONFIG_HARDWALL #include <asm/hardwall.h> #endif @@ -404,6 +405,37 @@ static const struct file_operations proc_lstats_operations = { #endif +#ifdef CONFIG_CGROUPS +static int cgroup_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cgroup_show, pid); +} + +static const struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_PROC_PID_CPUSET + +static int cpuset_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); +} + +static const struct file_operations proc_cpuset_operations = { + .open = cpuset_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long totalpages = totalram_pages + total_swap_pages; @@ -1347,11 +1379,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf, struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) + if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); @@ -1621,6 +1652,15 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +int pid_delete_dentry(const struct dentry *dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, @@ -1641,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations = * reported by readdir in sync with the inode numbers reported * by stat. */ -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, +bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { - struct dentry *child, *dir = filp->f_path.dentry; + struct dentry *child, *dir = file->f_path.dentry; + struct qstr qname = QSTR_INIT(name, len); struct inode *inode; - struct qstr qname; - ino_t ino = 0; - unsigned type = DT_UNKNOWN; - - qname.name = name; - qname.len = len; - qname.hash = full_name_hash(name, len); + unsigned type; + ino_t ino; - child = d_lookup(dir, &qname); + child = d_hash_and_lookup(dir, &qname); if (!child) { - struct dentry *new; - new = d_alloc(dir, &qname); - if (new) { - child = instantiate(dir->d_inode, new, task, ptr); - if (child) - dput(new); - else - child = new; + child = d_alloc(dir, &qname); + if (!child) + goto end_instantiate; + if (instantiate(dir->d_inode, child, task, ptr) < 0) { + dput(child); + goto end_instantiate; } } - if (!child || IS_ERR(child) || !child->d_inode) - goto end_instantiate; inode = child->d_inode; - if (inode) { - ino = inode->i_ino; - type = inode->i_mode >> 12; - } + ino = inode->i_ino; + type = inode->i_mode >> 12; dput(child); + return dir_emit(ctx, name, len, ino, type); + end_instantiate: - if (!ino) - ino = find_inode_number(dir, &qname); - if (!ino) - ino = 1; - return filldir(dirent, name, len, filp->f_pos, ino, type); + return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -1806,7 +1834,7 @@ struct map_files_info { unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; -static struct dentry * +static int proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -1816,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) - return ERR_PTR(-ENOENT); + return -ENOENT; ei = PROC_I(inode); ei->op.proc_get_link = proc_map_files_get_link; @@ -1833,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, d_set_d_op(dentry, &tid_map_files_dentry_operations); d_add(dentry, inode); - return NULL; + return 0; } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -1842,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; - struct dentry *result; + int result; struct mm_struct *mm; - result = ERR_PTR(-EPERM); + result = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out; - result = ERR_PTR(-ENOENT); + result = -ENOENT; task = get_proc_task(dir); if (!task) goto out; - result = ERR_PTR(-EACCES); + result = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - result = ERR_PTR(-ENOENT); + result = -ENOENT; if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; @@ -1881,7 +1909,7 @@ out_no_vma: out_put_task: put_task_struct(task); out: - return result; + return ERR_PTR(result); } static const struct inode_operations proc_map_files_inode_operations = { @@ -1891,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = { }; static int -proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +proc_map_files_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; - ino_t ino; + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; int ret; ret = -EPERM; @@ -1906,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; ret = -ENOENT; - task = get_proc_task(inode); + task = get_proc_task(file_inode(file)); if (!task) goto out; @@ -1915,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out_put_task; ret = 0; - switch (filp->f_pos) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) - goto out_put_task; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_put_task; - filp->f_pos++; - default: - { - unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; - struct map_files_info *p; - - mm = get_task_mm(task); - if (!mm) - goto out_put_task; - down_read(&mm->mmap_sem); + if (!dir_emit_dots(file, ctx)) + goto out_put_task; - nr_files = 0; + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + down_read(&mm->mmap_sem); - /* - * We need two passes here: - * - * 1) Collect vmas of mapped files with mmap_sem taken - * 2) Release mmap_sem and instantiate entries - * - * otherwise we get lockdep complained, since filldir() - * routine might require mmap_sem taken in might_fault(). - */ + nr_files = 0; - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > filp->f_pos) - nr_files++; - } + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { - ret = -ENOMEM; - if (fa) - flex_array_free(fa); - up_read(&mm->mmap_sem); - mmput(mm); - goto out_put_task; - } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= filp->f_pos) - continue; - - info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); - if (flex_array_put(fa, i++, &info, GFP_KERNEL)) - BUG(); - } + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > ctx->pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_put_task; } - up_read(&mm->mmap_sem); - - for (i = 0; i < nr_files; i++) { - p = flex_array_get(fa, i); - ret = proc_fill_cache(filp, dirent, filldir, - p->name, p->len, - proc_map_files_instantiate, - task, - (void *)(unsigned long)p->mode); - if (ret) - break; - filp->f_pos++; + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; + + info.mode = vma->vm_file->f_mode; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); } - if (fa) - flex_array_free(fa); - mmput(mm); } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + if (!proc_fill_cache(file, ctx, + p->name, p->len, + proc_map_files_instantiate, + task, + (void *)(unsigned long)p->mode)) + break; + ctx->pos++; } + if (fa) + flex_array_free(fa); + mmput(mm); out_put_task: put_task_struct(task); @@ -2009,19 +2020,115 @@ out: static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, - .readdir = proc_map_files_readdir, + .iterate = proc_map_files_readdir, .llseek = default_llseek, }; +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + struct pid_namespace *ns; + unsigned long flags; +}; + +static void *timers_start(struct seq_file *m, loff_t *pos) +{ + struct timers_private *tp = m->private; + + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); +} + +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} + +static void timers_stop(struct seq_file *m, void *v) +{ + struct timers_private *tp = m->private; + + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } + + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } +} + +static int show_timer(struct seq_file *m, void *v) +{ + struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static char *nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; + + timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); + + return 0; +} + +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, +}; + +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + tp->ns = inode->i_sb->s_fs_info; + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif /* CONFIG_CHECKPOINT_RESTORE */ -static struct dentry *proc_pident_instantiate(struct inode *dir, +static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) @@ -2040,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -2050,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - struct dentry *error; + int error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; - error = ERR_PTR(-ENOENT); + error = -ENOENT; if (!task) goto out_no_task; @@ -2077,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir, out: put_task_struct(task); out_no_task: - return error; -} - -static int proc_pident_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_pident_instantiate, task, p); + return ERR_PTR(error); } -static int proc_pident_readdir(struct file *filp, - void *dirent, filldir_t filldir, +static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - const struct pid_entry *p, *last; - ino_t ino; - int ret; + struct task_struct *task = get_proc_task(file_inode(file)); + const struct pid_entry *p; - ret = -ENOENT; if (!task) - goto out_no_task; + return -ENOENT; - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= nents) { - ret = 1; - goto out; - } - p = ents + i; - last = &ents[nents - 1]; - while (p <= last) { - if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) - goto out; - filp->f_pos++; - p++; - } - } + if (!dir_emit_dots(file, ctx)) + goto out; + + if (ctx->pos >= nents + 2) + goto out; - ret = 1; + for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + if (!proc_fill_cache(file, ctx, p->name, p->len, + proc_pident_instantiate, task, p)) + break; + ctx->pos++; + } out: put_task_struct(task); -out_no_task: - return ret; + return 0; } #ifdef CONFIG_SECURITY @@ -2225,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = { REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), }; -static int proc_attr_dir_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); + return proc_pident_readdir(file, ctx, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, - .readdir = proc_attr_dir_readdir, + .iterate = proc_attr_dir_readdir, .llseek = default_llseek, }; @@ -2583,18 +2652,20 @@ static const struct pid_entry tgid_base_stuff[] = { REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), +#endif }; -static int proc_tgid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); + return proc_pident_readdir(file, ctx, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, - .readdir = proc_tgid_base_readdir, + .iterate = proc_tgid_base_readdir, .llseek = default_llseek, }; @@ -2696,11 +2767,10 @@ void proc_flush_task(struct task_struct *task) } } -static struct dentry *proc_pid_instantiate(struct inode *dir, - struct dentry * dentry, - struct task_struct *task, const void *ptr) +static int proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); @@ -2720,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = NULL; + int result = 0; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; @@ -2748,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign result = proc_pid_instantiate(dir, dentry, task, NULL); put_task_struct(task); out: - return result; + return ERR_PTR(result); } /* @@ -2794,52 +2864,44 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY) - -static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct tgid_iter iter) -{ - char name[PROC_NUMBUF]; - int len = snprintf(name, sizeof(name), "%d", iter.tgid); - return proc_fill_cache(filp, dirent, filldir, name, len, - proc_pid_instantiate, iter.task, NULL); -} - -static int fake_filldir(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned d_type) -{ - return 0; -} +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) /* for the /proc/ directory itself, after non-process stuff has been done */ -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns; - filldir_t __filldir; + struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info; + loff_t pos = ctx->pos; - if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out; + if (pos >= PID_MAX_LIMIT + TGID_OFFSET) + return 0; - ns = filp->f_dentry->d_sb->s_fs_info; + if (pos == TGID_OFFSET - 1) { + struct inode *inode = ns->proc_self->d_inode; + if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + return 0; + iter.tgid = 0; + } else { + iter.tgid = pos - TGID_OFFSET; + } iter.task = NULL; - iter.tgid = filp->f_pos - TGID_OFFSET; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - if (has_pid_permissions(ns, iter.task, 2)) - __filldir = filldir; - else - __filldir = fake_filldir; + char name[PROC_NUMBUF]; + int len; + if (!has_pid_permissions(ns, iter.task, 2)) + continue; - filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { + len = snprintf(name, sizeof(name), "%d", iter.tgid); + ctx->pos = iter.tgid + TGID_OFFSET; + if (!proc_fill_cache(file, ctx, name, len, + proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); - goto out; + return 0; } } - filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; -out: + ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } @@ -2927,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = { #endif }; -static int proc_tid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); + return proc_pident_readdir(file, ctx, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -2942,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, - .readdir = proc_tid_base_readdir, + .iterate = proc_tid_base_readdir, .llseek = default_llseek, }; @@ -2952,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = { .setattr = proc_setattr, }; -static struct dentry *proc_task_instantiate(struct inode *dir, +static int proc_task_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); @@ -2974,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = ERR_PTR(-ENOENT); + int result = -ENOENT; struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; @@ -3011,7 +3071,7 @@ out_drop_task: out: put_task_struct(leader); out_no_task: - return result; + return ERR_PTR(result); } /* @@ -3083,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start) return pos; } -static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct task_struct *task, int tid) -{ - char name[PROC_NUMBUF]; - int len = snprintf(name, sizeof(name), "%d", tid); - return proc_fill_cache(filp, dirent, filldir, name, len, - proc_task_instantiate, task, NULL); -} - /* for the /proc/TGID/task/ directories */ -static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; struct task_struct *leader = NULL; - struct task_struct *task; - int retval = -ENOENT; - ino_t ino; - int tid; + struct task_struct *task = get_proc_task(file_inode(file)); struct pid_namespace *ns; + int tid; - task = get_proc_task(inode); if (!task) - goto out_no_task; + return -ENOENT; rcu_read_lock(); if (pid_alive(task)) { leader = task->group_leader; @@ -3115,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi rcu_read_unlock(); put_task_struct(task); if (!leader) - goto out_no_task; - retval = 0; + return -ENOENT; - switch ((unsigned long)filp->f_pos) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - /* fall through */ - } + if (!dir_emit_dots(file, ctx)) + goto out; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = filp->f_dentry->d_sb->s_fs_info; - tid = (int)filp->f_version; - filp->f_version = 0; - for (task = first_tid(leader, tid, filp->f_pos - 2, ns); + ns = file->f_dentry->d_sb->s_fs_info; + tid = (int)file->f_version; + file->f_version = 0; + for (task = first_tid(leader, tid, ctx->pos - 2, ns); task; - task = next_tid(task), filp->f_pos++) { + task = next_tid(task), ctx->pos++) { + char name[PROC_NUMBUF]; + int len; tid = task_pid_nr_ns(task, ns); - if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { + len = snprintf(name, sizeof(name), "%d", tid); + if (!proc_fill_cache(file, ctx, name, len, + proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - filp->f_version = (u64)tid; + file->f_version = (u64)tid; put_task_struct(task); break; } } out: put_task_struct(leader); -out_no_task: - return retval; + return 0; } static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -3180,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, - .readdir = proc_task_readdir, + .iterate = proc_task_readdir, .llseek = default_llseek, }; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d7a4a28ef630..75f2890abbd8 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) return ret; } -static struct dentry * +static int proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); unsigned fd = (unsigned long)ptr; struct proc_inode *ei; struct inode *inode; @@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry, /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, instantiate_t instantiate) { struct task_struct *task = get_proc_task(dir); - struct dentry *result = ERR_PTR(-ENOENT); + int result = -ENOENT; unsigned fd = name_to_int(dentry); if (!task) @@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, out: put_task_struct(task); out_no_task: - return result; + return ERR_PTR(result); } -static int proc_readfd_common(struct file * filp, void * dirent, - filldir_t filldir, instantiate_t instantiate) +static int proc_readfd_common(struct file *file, struct dir_context *ctx, + instantiate_t instantiate) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); + struct task_struct *p = get_proc_task(file_inode(file)); struct files_struct *files; - unsigned int fd, ino; - int retval; + unsigned int fd; - retval = -ENOENT; if (!p) - goto out_no_task; - retval = 0; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - for (fd = filp->f_pos - 2; - fd < files_fdtable(files)->max_fds; - fd++, filp->f_pos++) { - char name[PROC_NUMBUF]; - int len; - int rv; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); + return -ENOENT; - len = snprintf(name, sizeof(name), "%d", fd); - rv = proc_fill_cache(filp, dirent, filldir, - name, len, instantiate, p, - (void *)(unsigned long)fd); - if (rv < 0) - goto out_fd_loop; - rcu_read_lock(); - } - rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); + if (!dir_emit_dots(file, ctx)) + goto out; + if (!dir_emit_dots(file, ctx)) + goto out; + files = get_files_struct(p); + if (!files) + goto out; + + rcu_read_lock(); + for (fd = ctx->pos - 2; + fd < files_fdtable(files)->max_fds; + fd++, ctx->pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (!proc_fill_cache(file, ctx, + name, len, instantiate, p, + (void *)(unsigned long)fd)) + goto out_fd_loop; + rcu_read_lock(); } + rcu_read_unlock(); +out_fd_loop: + put_files_struct(files); out: put_task_struct(p); -out_no_task: - return retval; + return 0; } -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +static int proc_readfd(struct file *file, struct dir_context *ctx) { - return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); + return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .readdir = proc_readfd, + .iterate = proc_readfd, .llseek = default_llseek, }; @@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = { .setattr = proc_setattr, }; -static struct dentry * +static int proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); unsigned fd = (unsigned long)ptr; struct proc_inode *ei; struct inode *inode; @@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry * @@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +static int proc_readfdinfo(struct file *file, struct dir_context *ctx) { - return proc_readfd_common(filp, dirent, filldir, + return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); } @@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = { const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, - .readdir = proc_readfdinfo, + .iterate = proc_readfdinfo, .llseek = default_llseek, }; diff --git a/fs/proc/fd.h b/fs/proc/fd.h index cbb1d47deda8..7c047f256ae2 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -11,4 +11,9 @@ extern const struct inode_operations proc_fdinfo_inode_operations; extern int proc_fd_permission(struct inode *inode, int mask); +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + #endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 21e1a8f1659d..94441a407337 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -36,212 +36,6 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry return !memcmp(name, de->name, len); } -/* buffer size is one page but our output routines use some slack for overruns */ -#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) - -static ssize_t -__proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct inode * inode = file_inode(file); - char *page; - ssize_t retval=0; - int eof=0; - ssize_t n, count; - char *start; - struct proc_dir_entry * dp; - unsigned long long pos; - - /* - * Gaah, please just use "seq_file" instead. The legacy /proc - * interfaces cut loff_t down to off_t for reads, and ignore - * the offset entirely for writes.. - */ - pos = *ppos; - if (pos > MAX_NON_LFS) - return 0; - if (nbytes > MAX_NON_LFS - pos) - nbytes = MAX_NON_LFS - pos; - - dp = PDE(inode); - if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - while ((nbytes > 0) && !eof) { - count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); - - start = NULL; - if (dp->read_proc) { - /* - * How to be a proc read function - * ------------------------------ - * Prototype: - * int f(char *buffer, char **start, off_t offset, - * int count, int *peof, void *dat) - * - * Assume that the buffer is "count" bytes in size. - * - * If you know you have supplied all the data you - * have, set *peof. - * - * You have three ways to return data: - * 0) Leave *start = NULL. (This is the default.) - * Put the data of the requested offset at that - * offset within the buffer. Return the number (n) - * of bytes there are from the beginning of the - * buffer up to the last byte of data. If the - * number of supplied bytes (= n - offset) is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by the number of bytes - * absorbed. This interface is useful for files - * no larger than the buffer. - * 1) Set *start = an unsigned long value less than - * the buffer address but greater than zero. - * Put the data of the requested offset at the - * beginning of the buffer. Return the number of - * bytes of data placed there. If this number is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by *start. This interface is - * useful when you have a large file consisting - * of a series of blocks which you want to count - * and return as wholes. - * (Hack by Paul.Russell@rustcorp.com.au) - * 2) Set *start = an address within the buffer. - * Put the data of the requested offset at *start. - * Return the number of bytes of data placed there. - * If this number is greater than zero and you - * didn't signal eof and the reader is prepared to - * take more data you will be called again with the - * requested offset advanced by the number of bytes - * absorbed. - */ - n = dp->read_proc(page, &start, *ppos, - count, &eof, dp->data); - } else - break; - - if (n == 0) /* end of file */ - break; - if (n < 0) { /* error */ - if (retval == 0) - retval = n; - break; - } - - if (start == NULL) { - if (n > PAGE_SIZE) /* Apparent buffer overflow */ - n = PAGE_SIZE; - n -= *ppos; - if (n <= 0) - break; - if (n > count) - n = count; - start = page + *ppos; - } else if (start < page) { - if (n > PAGE_SIZE) /* Apparent buffer overflow */ - n = PAGE_SIZE; - if (n > count) { - /* - * Don't reduce n because doing so might - * cut off part of a data block. - */ - pr_warn("proc_file_read: count exceeded\n"); - } - } else /* start >= page */ { - unsigned long startoff = (unsigned long)(start - page); - if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */ - n = PAGE_SIZE - startoff; - if (n > count) - n = count; - } - - n -= copy_to_user(buf, start < page ? page : start, n); - if (n == 0) { - if (retval == 0) - retval = -EFAULT; - break; - } - - *ppos += start < page ? (unsigned long)start : n; - nbytes -= n; - buf += n; - retval += n; - } - free_page((unsigned long) page); - return retval; -} - -static ssize_t -proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file_inode(file)); - ssize_t rv = -EIO; - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - rv = __proc_file_read(file, buf, nbytes, ppos); - - pde_users_dec(pde); - return rv; -} - -static ssize_t -proc_file_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file_inode(file)); - ssize_t rv = -EIO; - - if (pde->write_proc) { - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - /* FIXME: does this routine need ppos? probably... */ - rv = pde->write_proc(file, buffer, count, pde->data); - pde_users_dec(pde); - } - return rv; -} - - -static loff_t -proc_file_lseek(struct file *file, loff_t offset, int orig) -{ - loff_t retval = -EINVAL; - switch (orig) { - case 1: - offset += file->f_pos; - /* fallthrough */ - case 0: - if (offset < 0 || offset > MAX_NON_LFS) - break; - file->f_pos = retval = offset; - } - return retval; -} - -static const struct file_operations proc_file_operations = { - .llseek = proc_file_lseek, - .read = proc_file_read, - .write = proc_file_write, -}; - static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; @@ -371,7 +165,7 @@ void proc_free_inum(unsigned int inum) static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, PDE(dentry->d_inode)->data); + nd_set_link(nd, __PDE_DATA(dentry->d_inode)); return NULL; } @@ -439,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir) +int proc_readdir_de(struct proc_dir_entry *de, struct file *file, + struct dir_context *ctx) { - unsigned int ino; int i; - struct inode *inode = file_inode(filp); - int ret = 0; - - ino = inode->i_ino; - i = filp->f_pos; - switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, i, - parent_ino(filp->f_path.dentry), - DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - spin_lock(&proc_subdir_lock); - de = de->subdir; - i -= 2; - for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } - if (!i) - break; - de = de->next; - i--; - } - do { - struct proc_dir_entry *next; - - /* filldir passes info to user space */ - pde_get(de); - spin_unlock(&proc_subdir_lock); - if (filldir(dirent, de->name, de->namelen, filp->f_pos, - de->low_ino, de->mode >> 12) < 0) { - pde_put(de); - goto out; - } - spin_lock(&proc_subdir_lock); - filp->f_pos++; - next = de->next; - pde_put(de); - de = next; - } while (de); + if (!dir_emit_dots(file, ctx)) + return 0; + + spin_lock(&proc_subdir_lock); + de = de->subdir; + i = ctx->pos - 2; + for (;;) { + if (!de) { spin_unlock(&proc_subdir_lock); + return 0; + } + if (!i) + break; + de = de->next; + i--; } - ret = 1; -out: - return ret; + + do { + struct proc_dir_entry *next; + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (!dir_emit(ctx, de->name, de->namelen, + de->low_ino, de->mode >> 12)) { + pde_put(de); + return 0; + } + spin_lock(&proc_subdir_lock); + ctx->pos++; + next = de->next; + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + return 0; } -int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +int proc_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), filp, dirent, filldir); + return proc_readdir_de(PDE(inode), file, ctx); } /* @@ -519,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) static const struct file_operations proc_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = proc_readdir, + .iterate = proc_readdir, }; /* @@ -541,19 +311,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp return ret; if (S_ISDIR(dp->mode)) { - if (dp->proc_iops == NULL) { - dp->proc_fops = &proc_dir_operations; - dp->proc_iops = &proc_dir_inode_operations; - } + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; dir->nlink++; } else if (S_ISLNK(dp->mode)) { - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_link_inode_operations; + dp->proc_iops = &proc_link_inode_operations; } else if (S_ISREG(dp->mode)) { - if (dp->proc_fops == NULL) - dp->proc_fops = &proc_file_operations; - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_file_inode_operations; + BUG_ON(dp->proc_fops == NULL); + dp->proc_iops = &proc_file_inode_operations; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&proc_subdir_lock); @@ -636,13 +404,17 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) { struct proc_dir_entry *ent; + if (mode == 0) + mode = S_IRUGO | S_IXUGO; + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { + ent->data = data; if (proc_register(parent, ent) < 0) { kfree(ent); ent = NULL; @@ -650,82 +422,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, } return ent; } -EXPORT_SYMBOL(proc_mkdir_mode); +EXPORT_SYMBOL_GPL(proc_mkdir_data); -struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) { - struct proc_dir_entry *ent; - - ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); - if (ent) { - ent->data = net; - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; + return proc_mkdir_data(name, mode, parent, NULL); } -EXPORT_SYMBOL_GPL(proc_net_mkdir); +EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { - return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); + return proc_mkdir_data(name, 0, parent, NULL); } EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *ent; - nlink_t nlink; - - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; - } - - ent = __proc_create(&parent, name, mode, nlink); - if (ent) { - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; -} -EXPORT_SYMBOL(create_proc_entry); - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, void *data) { struct proc_dir_entry *pde; - nlink_t nlink; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; + if (!S_ISREG(mode)) { + WARN_ON(1); /* use proc_mkdir() */ + return NULL; } - pde = __proc_create(&parent, name, mode, nlink); + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + pde = __proc_create(&parent, name, mode, 1); if (!pde) goto out; pde->proc_fops = proc_fops; @@ -739,6 +468,19 @@ out: return NULL; } EXPORT_SYMBOL(proc_create_data); + +void proc_set_size(struct proc_dir_entry *de, loff_t size) +{ + de->size = size; +} +EXPORT_SYMBOL(proc_set_size); + +void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) +{ + de->uid = uid; + de->gid = gid; +} +EXPORT_SYMBOL(proc_set_user); static void free_proc_entry(struct proc_dir_entry *de) { @@ -755,41 +497,6 @@ void pde_put(struct proc_dir_entry *pde) free_proc_entry(pde); } -static void entry_rundown(struct proc_dir_entry *de) -{ - spin_lock(&de->pde_unload_lock); - /* - * Stop accepting new callers into module. If you're - * dynamically allocating ->proc_fops, save a pointer somewhere. - */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. */ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); - - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; - - spin_unlock(&de->pde_unload_lock); - - wait_for_completion(de->pde_unload_completion); - - spin_lock(&de->pde_unload_lock); - } - - while (!list_empty(&de->pde_openers)) { - struct pde_opener *pdeo; - - pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); - list_del(&pdeo->lh); - spin_unlock(&de->pde_unload_lock); - pdeo->release(pdeo->inode, pdeo->file); - kfree(pdeo); - spin_lock(&de->pde_unload_lock); - } - spin_unlock(&de->pde_unload_lock); -} - /* * Remove a /proc entry and free it if it's not currently in use. */ @@ -821,7 +528,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) return; } - entry_rundown(de); + proc_entry_rundown(de); if (S_ISDIR(de->mode)) parent->nlink--; @@ -870,7 +577,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) } spin_unlock(&proc_subdir_lock); - entry_rundown(de); + proc_entry_rundown(de); next = de->parent; if (S_ISDIR(de->mode)) next->nlink--; @@ -886,3 +593,23 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) return 0; } EXPORT_SYMBOL(remove_proc_subtree); + +void *proc_get_parent_data(const struct inode *inode) +{ + struct proc_dir_entry *de = PDE(inode); + return de->parent->data; +} +EXPORT_SYMBOL_GPL(proc_get_parent_data); + +void proc_remove(struct proc_dir_entry *de) +{ + if (de) + remove_proc_subtree(de->name, de->parent); +} +EXPORT_SYMBOL(proc_remove); + +void *PDE_DATA(const struct inode *inode) +{ + return __PDE_DATA(inode); +} +EXPORT_SYMBOL(PDE_DATA); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 869116c2afbe..073aea60cf8f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> +#include <linux/magic.h> #include <asm/uaccess.h> @@ -50,8 +51,8 @@ static void proc_evict_inode(struct inode *inode) sysctl_head_put(head); } /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns_ops; - ns = PROC_I(inode)->ns; + ns_ops = PROC_I(inode)->ns.ns_ops; + ns = PROC_I(inode)->ns.ns; if (ns_ops && ns) ns_ops->put(ns); } @@ -72,8 +73,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns = NULL; - ei->ns_ops = NULL; + ei->ns.ns = NULL; + ei->ns.ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; @@ -129,96 +130,100 @@ static const struct super_operations proc_sops = { .show_options = proc_show_options, }; -static void __pde_users_dec(struct proc_dir_entry *pde) +enum {BIAS = -1U<<31}; + +static inline int use_pde(struct proc_dir_entry *pde) +{ + return atomic_inc_unless_negative(&pde->in_use); +} + +static void unuse_pde(struct proc_dir_entry *pde) { - pde->pde_users--; - if (pde->pde_unload_completion && pde->pde_users == 0) + if (atomic_dec_return(&pde->in_use) == BIAS) complete(pde->pde_unload_completion); } -void pde_users_dec(struct proc_dir_entry *pde) +/* pde is locked */ +static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { - spin_lock(&pde->pde_unload_lock); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); + if (pdeo->closing) { + /* somebody else is doing that, just wait */ + DECLARE_COMPLETION_ONSTACK(c); + pdeo->c = &c; + spin_unlock(&pde->pde_unload_lock); + wait_for_completion(&c); + spin_lock(&pde->pde_unload_lock); + } else { + struct file *file; + pdeo->closing = 1; + spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; + pde->proc_fops->release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); + list_del_init(&pdeo->lh); + if (pdeo->c) + complete(pdeo->c); + kfree(pdeo); + } +} + +void proc_entry_rundown(struct proc_dir_entry *de) +{ + DECLARE_COMPLETION_ONSTACK(c); + /* Wait until all existing callers into module are done. */ + de->pde_unload_completion = &c; + if (atomic_add_return(BIAS, &de->in_use) != BIAS) + wait_for_completion(&c); + + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + close_pdeo(de, pdeo); + } + spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - loff_t (*llseek)(struct file *, loff_t, int); - - spin_lock(&pde->pde_unload_lock); - /* - * remove_proc_entry() is going to delete PDE (as part of module - * cleanup sequence). No new callers into module allowed. - */ - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + loff_t (*llseek)(struct file *, loff_t, int); + llseek = pde->proc_fops->llseek; + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + unuse_pde(pde); } - /* - * Bump refcount so that remove_proc_entry will wail for ->llseek to - * complete. - */ - pde->pde_users++; - /* - * Save function pointer under lock, to protect against ->proc_fops - * NULL'ifying right after ->pde_unload_lock is dropped. - */ - llseek = pde->proc_fops->llseek; - spin_unlock(&pde->pde_unload_lock); - - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + read = pde->proc_fops->read; + if (read) + rv = read(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - read = pde->proc_fops->read; - spin_unlock(&pde->pde_unload_lock); - - if (read) - rv = read(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + write = pde->proc_fops->write; + if (write) + rv = write(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - write = pde->proc_fops->write; - spin_unlock(&pde->pde_unload_lock); - - if (write) - rv = write(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } @@ -227,20 +232,12 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + poll = pde->proc_fops->poll; + if (poll) + rv = poll(file, pts); + unuse_pde(pde); } - pde->pde_users++; - poll = pde->proc_fops->poll; - spin_unlock(&pde->pde_unload_lock); - - if (poll) - rv = poll(file, pts); - - pde_users_dec(pde); return rv; } @@ -249,20 +246,12 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + ioctl = pde->proc_fops->unlocked_ioctl; + if (ioctl) + rv = ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - ioctl = pde->proc_fops->unlocked_ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (ioctl) - rv = ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } @@ -272,20 +261,12 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*compat_ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + compat_ioctl = pde->proc_fops->compat_ioctl; + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - compat_ioctl = pde->proc_fops->compat_ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } #endif @@ -295,20 +276,12 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; int (*mmap)(struct file *, struct vm_area_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + mmap = pde->proc_fops->mmap; + if (mmap) + rv = mmap(file, vma); + unuse_pde(pde); } - pde->pde_users++; - mmap = pde->proc_fops->mmap; - spin_unlock(&pde->pde_unload_lock); - - if (mmap) - rv = mmap(file, vma); - - pde_users_dec(pde); return rv; } @@ -330,91 +303,47 @@ static int proc_reg_open(struct inode *inode, struct file *file) * by hand in remove_proc_entry(). For this, save opener's credentials * for later. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + if (!use_pde(pde)) { kfree(pdeo); return -ENOENT; } - pde->pde_users++; open = pde->proc_fops->open; release = pde->proc_fops->release; - spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - spin_lock(&pde->pde_unload_lock); if (rv == 0 && release) { /* To know what to release. */ - pdeo->inode = inode; pdeo->file = file; /* Strictly for "too late" ->release in proc_reg_release(). */ - pdeo->release = release; + spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); } else kfree(pdeo); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); - return rv; -} - -static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, - struct inode *inode, struct file *file) -{ - struct pde_opener *pdeo; - list_for_each_entry(pdeo, &pde->pde_openers, lh) { - if (pdeo->inode == inode && pdeo->file == file) - return pdeo; - } - return NULL; + unuse_pde(pde); + return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); - int rv = 0; - int (*release)(struct inode *, struct file *); struct pde_opener *pdeo; - spin_lock(&pde->pde_unload_lock); - pdeo = find_pde_opener(pde, inode, file); - if (!pde->proc_fops) { - /* - * Can't simply exit, __fput() will think that everything is OK, - * and move on to freeing struct file. remove_proc_entry() will - * find slacker in opener's list and will try to do non-trivial - * things with struct file. Therefore, remove opener from list. - * - * But if opener is removed from list, who will ->release it? - */ - if (pdeo) { - list_del(&pdeo->lh); - spin_unlock(&pde->pde_unload_lock); - rv = pdeo->release(inode, file); - kfree(pdeo); - } else - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - release = pde->proc_fops->release; - if (pdeo) { - list_del(&pdeo->lh); - kfree(pdeo); + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->file == file) { + close_pdeo(pde, pdeo); + break; + } } spin_unlock(&pde->pde_unload_lock); - - if (release) - rv = release(inode, file); - - pde_users_dec(pde); - return rv; + return 0; } static const struct file_operations proc_reg_file_ops = { @@ -462,8 +391,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); - if (de->proc_iops) - inode->i_op = de->proc_iops; + WARN_ON(!de->proc_iops); + inode->i_op = de->proc_iops; if (de->proc_fops) { if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT @@ -506,5 +435,5 @@ int proc_fill_super(struct super_block *s) return -ENOMEM; } - return 0; + return proc_setup_self(s); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 85ff3a4598b3..651d09a11dde 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -1,4 +1,4 @@ -/* internal.h: internal procfs definitions +/* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -9,80 +9,83 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/sched.h> #include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> #include <linux/binfmts.h> -struct ctl_table_header; -struct mempolicy; -extern struct proc_dir_entry proc_root; -extern void proc_self_init(void); -#ifdef CONFIG_PROC_SYSCTL -extern int proc_sys_init(void); -extern void sysctl_head_put(struct ctl_table_header *head); -#else -static inline void proc_sys_init(void) { } -static inline void sysctl_head_put(struct ctl_table_header *head) { } -#endif -#ifdef CONFIG_NET -extern int proc_net_init(void); -#else -static inline int proc_net_init(void) { return 0; } -#endif +struct ctl_table_header; +struct mempolicy; -struct vmalloc_info { - unsigned long used; - unsigned long largest_chunk; +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * The "next" pointer creates a linked list of one /proc directory, + * while parent/subdir create the directory structure (every + * /proc file has a parent, but "subdir" is NULL for all + * non-directory entries). + */ +struct proc_dir_entry { + unsigned int low_ino; + umode_t mode; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + struct proc_dir_entry *next, *parent, *subdir; + void *data; + atomic_t count; /* use count */ + atomic_t in_use; /* number of callers into module in progress; */ + /* negative -> it's going away RSN */ + struct completion *pde_unload_completion; + struct list_head pde_openers; /* who did ->open, but not ->release */ + spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + u8 namelen; + char name[]; }; -#ifdef CONFIG_MMU -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -extern void get_vmalloc_info(struct vmalloc_info *vmi); -#else - -#define VMALLOC_TOTAL 0UL -#define get_vmalloc_info(vmi) \ -do { \ - (vmi)->used = 0; \ - (vmi)->largest_chunk = 0; \ -} while(0) -#endif - -extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); - -extern const struct file_operations proc_tid_children_operations; -extern const struct file_operations proc_pid_maps_operations; -extern const struct file_operations proc_tid_maps_operations; -extern const struct file_operations proc_pid_numa_maps_operations; -extern const struct file_operations proc_tid_numa_maps_operations; -extern const struct file_operations proc_pid_smaps_operations; -extern const struct file_operations proc_tid_smaps_operations; -extern const struct file_operations proc_clear_refs_operations; -extern const struct file_operations proc_pagemap_operations; -extern const struct file_operations proc_net_operations; -extern const struct inode_operations proc_net_inode_operations; -extern const struct inode_operations proc_pid_link_inode_operations; +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); +}; -struct proc_maps_private { +struct proc_inode { struct pid *pid; - struct task_struct *task; -#ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; -#endif -#ifdef CONFIG_NUMA - struct mempolicy *task_mempolicy; -#endif + int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct proc_ns ns; + struct inode vfs_inode; }; -void proc_init_inodecache(void); +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} static inline struct pid *proc_pid(struct inode *inode) { @@ -94,11 +97,6 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_fd(struct inode *inode) -{ - return PROC_I(inode)->fd; -} - static inline int task_dumpable(struct task_struct *task) { int dumpable = 0; @@ -114,15 +112,6 @@ static inline int task_dumpable(struct task_struct *task) return 0; } -static inline int pid_delete_dentry(const struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. - */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - static inline unsigned name_to_int(struct dentry *dentry) { const char *name = dentry->d_name.name; @@ -145,63 +134,165 @@ out: return ~0U; } -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, - struct dentry *dentry); -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir); +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 -struct pde_opener { - struct inode *inode; - struct file *file; - int (*release)(struct inode *, struct file *); - struct list_head lh; -}; -void pde_users_dec(struct proc_dir_entry *pde); +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); + +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern int pid_revalidate(struct dentry *, unsigned int); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, struct dir_context *); +extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); +/* Lookups */ +typedef int instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, + instantiate_t, struct task_struct *, const void *); + +/* + * generic.c + */ extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -unsigned long task_statm(struct mm_struct *, - unsigned long *, unsigned long *, unsigned long *, unsigned long *); -void task_mem(struct seq_file *, struct mm_struct *); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, + struct dentry *); +extern int proc_readdir(struct file *, struct dir_context *); +extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { atomic_inc(&pde->count); return pde; } -void pde_put(struct proc_dir_entry *pde); - -int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -int proc_remount(struct super_block *sb, int *flags, char *data); +extern void pde_put(struct proc_dir_entry *); /* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/<pid> subdirectories. + * inode.c */ -int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +struct pde_opener { + struct file *file; + struct list_head lh; + int closing; + struct completion *c; +}; +extern const struct inode_operations proc_pid_link_inode_operations; +extern void proc_init_inodecache(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern int proc_fill_super(struct super_block *); +extern void proc_entry_rundown(struct proc_dir_entry *); -/* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, - instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, unsigned int flags); -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); -extern const struct dentry_operations pid_dentry_operations; -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int proc_setattr(struct dentry *dentry, struct iattr *attr); +/* + * proc_devtree.c + */ +#ifdef CONFIG_PROC_DEVICETREE +extern void proc_device_tree_init(void); +#endif +/* + * proc_namespaces.c + */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *); +#else +static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); +extern int proc_remount(struct super_block *, int *, char *); + +/* + * task_[no]mmu.c + */ +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +}; + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void task_mem(struct seq_file *, struct mm_struct *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index eda6f017f272..06ea155e1a59 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -11,10 +11,12 @@ #include <linux/mm.h> #include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/notifier.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> @@ -27,6 +29,7 @@ #include <linux/ioport.h> #include <linux/memory.h> #include <asm/sections.h> +#include "internal.h" #define CORE_STR "CORE" @@ -405,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) prpsinfo.pr_zomb = 0; strcpy(prpsinfo.pr_fname, "vmlinux"); - strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); + strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); nhdr->p_filesz += notesize(¬es[1]); bufp = storenote(¬es[1], bufp); @@ -564,7 +567,6 @@ static const struct file_operations proc_kcore_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_MEMORY_HOTPLUG /* just remember that we have to update kcore */ static int __meminit kcore_callback(struct notifier_block *self, unsigned long action, void *arg) @@ -578,8 +580,11 @@ static int __meminit kcore_callback(struct notifier_block *self, } return NOTIFY_OK; } -#endif +static struct notifier_block kcore_callback_nb __meminitdata = { + .notifier_call = kcore_callback, + .priority = 0, +}; static struct kcore_list kcore_vmalloc; @@ -631,7 +636,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + register_hotmemory_notifier(&kcore_callback_nb); return 0; } diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 1efaaa19c4f3..5aa847a603c0 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -11,6 +11,7 @@ #include <linux/swap.h> #include <linux/vmstat.h> #include <linux/atomic.h> +#include <linux/vmalloc.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c deleted file mode 100644 index 8ae221dfd010..000000000000 --- a/fs/proc/mmu.c +++ /dev/null @@ -1,60 +0,0 @@ -/* mmu.c: mmu memory info files - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <linux/spinlock.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <asm/pgtable.h> -#include "internal.h" - -void get_vmalloc_info(struct vmalloc_info *vmi) -{ - struct vm_struct *vma; - unsigned long free_area_size; - unsigned long prev_end; - - vmi->used = 0; - - if (!vmlist) { - vmi->largest_chunk = VMALLOC_TOTAL; - } - else { - vmi->largest_chunk = 0; - - prev_end = VMALLOC_START; - - read_lock(&vmlist_lock); - - for (vma = vmlist; vma; vma = vma->next) { - unsigned long addr = (unsigned long) vma->addr; - - /* - * Some archs keep another range for modules in vmlist - */ - if (addr < VMALLOC_START) - continue; - if (addr >= VMALLOC_END) - break; - - vmi->used += vma->size; - - free_area_size = addr - prev_end; - if (vmi->largest_chunk < free_area_size) - vmi->largest_chunk = free_area_size; - - prev_end = vma->size + addr; - } - - if (VMALLOC_END - prev_end > vmi->largest_chunk) - vmi->largest_chunk = VMALLOC_END - prev_end; - - read_unlock(&vmlist_lock); - } -} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 66b51c0383da..49a7fff2e83a 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -51,7 +51,7 @@ static int ns_delete_dentry(const struct dentry *dentry) static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); @@ -95,8 +95,8 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, inode->i_op = &ns_inode_operations; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + ei->ns.ns_ops = ns_ops; + ei->ns.ns = ns; unlock_new_inode(inode); } else { ns_ops->put(ns); @@ -128,7 +128,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); if (IS_ERR(ns_path.dentry)) { error = ERR_CAST(ns_path.dentry); goto out_put_task; @@ -148,7 +148,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl { struct inode *inode = dentry->d_inode; struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns_ops; + const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; struct task_struct *task; void *ns; char name[50]; @@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = { .setattr = proc_setattr, }; -static struct dentry *proc_ns_instantiate(struct inode *dir, +static int proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) @@ -202,96 +201,58 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; - ei->ns_ops = ns_ops; + ei->ns.ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; -} - -static int proc_ns_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, - const struct proc_ns_operations *ops) -{ - return proc_fill_cache(filp, dirent, filldir, - ops->name, strlen(ops->name), - proc_ns_instantiate, task, ops); + return -ENOENT; } -static int proc_ns_dir_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(file_inode(file)); const struct proc_ns_operations **entry, **last; - ino_t ino; - int ret; - ret = -ENOENT; if (!task) - goto out_no_task; + return -ENOENT; - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= ARRAY_SIZE(ns_entries)) { - ret = 1; - goto out; - } - entry = ns_entries + i; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - while (entry <= last) { - if (proc_ns_fill_cache(filp, dirent, filldir, - task, *entry) < 0) - goto out; - filp->f_pos++; - entry++; - } + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) + goto out; + entry = ns_entries + (ctx->pos - 2); + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + const struct proc_ns_operations *ops = *entry; + if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops)) + break; + ctx->pos++; + entry++; } - - ret = 1; out: put_task_struct(task); -out_no_task: - return ret; + return 0; } const struct file_operations proc_ns_dir_operations = { .read = generic_read_dir, - .readdir = proc_ns_dir_readdir, + .iterate = proc_ns_dir_readdir, }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *error; + int error; struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; - error = ERR_PTR(-ENOENT); + error = -ENOENT; if (!task) goto out_no_task; @@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, out: put_task_struct(task); out_no_task: - return error; + return ERR_PTR(error); } const struct inode_operations proc_ns_dir_inode_operations = { @@ -337,6 +298,11 @@ out_invalid: return ERR_PTR(-EINVAL); } +struct proc_ns *get_proc_ns(struct inode *inode) +{ + return &PROC_I(inode)->ns; +} + bool proc_ns_inode(struct inode *inode) { return inode->i_fop == &ns_file_operations; diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 30b590f5bd35..106a83570630 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -12,7 +12,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/of.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <asm/prom.h> #include <asm/uaccess.h> @@ -41,7 +41,7 @@ static int property_proc_show(struct seq_file *m, void *v) static int property_proc_open(struct inode *inode, struct file *file) { - return single_open(file, property_proc_show, PDE(inode)->data); + return single_open(file, property_proc_show, __PDE_DATA(inode)); } static const struct file_operations property_proc_fops = { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index b4ac6572474f..4677bb7dc7c2 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -26,6 +26,10 @@ #include "internal.h" +static inline struct net *PDE_NET(struct proc_dir_entry *pde) +{ + return pde->parent->data; +} static struct net *get_proc_net(const struct inode *inode) { @@ -156,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = { .getattr = proc_tgid_net_getattr, }; -static int proc_tgid_net_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; - net = get_proc_task_net(file_inode(filp)); + net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + ret = proc_readdir_de(net->proc_net, file, ctx); put_net(net); } return ret; @@ -174,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = proc_tgid_net_readdir, + .iterate = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ac05f33a0dde..71290463a1d3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -573,12 +573,12 @@ out: return ret; } -static int proc_sys_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, +static bool proc_sys_fill_cache(struct file *file, + struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { - struct dentry *child, *dir = filp->f_path.dentry; + struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; @@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, inode = proc_sys_make_inode(dir->d_sb, head, table); if (!inode) { dput(child); - return -ENOMEM; + return false; } else { d_set_d_op(child, &proc_sys_dentry_operations); d_add(child, inode); } } else { - return -ENOMEM; + return false; } } inode = child->d_inode; ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); - return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); + return dir_emit(ctx, qname.name, qname.len, ino, type); } -static int proc_sys_link_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, +static bool proc_sys_link_fill_cache(struct file *file, + struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { - int err, ret = 0; + bool ret = true; head = sysctl_head_grab(head); if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ - err = sysctl_follow_link(&head, &table, current->nsproxy); + int err = sysctl_follow_link(&head, &table, current->nsproxy); if (err) goto out; } - ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); + ret = proc_sys_fill_cache(file, ctx, head, table); out: sysctl_head_finish(head); return ret; @@ -634,67 +634,50 @@ out: static int scan(struct ctl_table_header *head, ctl_table *table, unsigned long *pos, struct file *file, - void *dirent, filldir_t filldir) + struct dir_context *ctx) { - int res; + bool res; - if ((*pos)++ < file->f_pos) - return 0; + if ((*pos)++ < ctx->pos) + return true; if (unlikely(S_ISLNK(table->mode))) - res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); + res = proc_sys_link_fill_cache(file, ctx, head, table); else - res = proc_sys_fill_cache(file, dirent, filldir, head, table); + res = proc_sys_fill_cache(file, ctx, head, table); - if (res == 0) - file->f_pos = *pos; + if (res) + ctx->pos = *pos; return res; } -static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct ctl_table_header *head = grab_header(inode); + struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; - int ret = -EINVAL; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); - ret = 0; - /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, - inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - } - if (filp->f_pos == 1) { - if (filldir(dirent, "..", 2, filp->f_pos, - parent_ino(dentry), DT_DIR) < 0) - goto out; - filp->f_pos++; - } + if (!dir_emit_dots(file, ctx)) + return 0; + pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { - ret = scan(h, entry, &pos, filp, dirent, filldir); - if (ret) { + if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } - ret = 1; -out: sysctl_head_finish(head); - return ret; + return 0; } static int proc_sys_permission(struct inode *inode, int mask) @@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = { static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, - .readdir = proc_sys_readdir, + .iterate = proc_sys_readdir, .llseek = generic_file_llseek, }; @@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p) return res; } -static int proc_sys_compare(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; + struct inode *inode; + /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ + inode = ACCESS_ONCE(dentry->d_inode); if (!inode) return 1; if (name->len != len) diff --git a/fs/proc/root.c b/fs/proc/root.c index 9c7fab1d23f0..229e366598da 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -141,6 +141,8 @@ static void proc_kill_sb(struct super_block *sb) struct pid_namespace *ns; ns = (struct pid_namespace *)sb->s_fs_info; + if (ns->proc_self) + dput(ns->proc_self); kill_anon_super(sb); put_pid_ns(ns); } @@ -200,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr return proc_pid_lookup(dir, dentry, flags); } -static int proc_root_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_root_readdir(struct file *file, struct dir_context *ctx) { - unsigned int nr = filp->f_pos; - int ret; - - if (nr < FIRST_PROCESS_ENTRY) { - int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) - return error; - filp->f_pos = FIRST_PROCESS_ENTRY; + if (ctx->pos < FIRST_PROCESS_ENTRY) { + proc_readdir(file, ctx); + ctx->pos = FIRST_PROCESS_ENTRY; } - ret = proc_pid_readdir(filp, dirent, filldir); - return ret; + return proc_pid_readdir(file, ctx); } /* @@ -224,7 +219,7 @@ static int proc_root_readdir(struct file * filp, */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, - .readdir = proc_root_readdir, + .iterate = proc_root_readdir, .llseek = default_llseek, }; diff --git a/fs/proc/self.c b/fs/proc/self.c index aa5cc3bff140..6b6a993b5c25 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,6 +1,8 @@ -#include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/namei.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" /* * /proc/self: @@ -48,12 +50,43 @@ static const struct inode_operations proc_self_inode_operations = { .put_link = proc_self_put_link, }; -void __init proc_self_init(void) +static unsigned self_inum; + +int proc_setup_self(struct super_block *s) { - struct proc_dir_entry *proc_self_symlink; - mode_t mode; + struct inode *root_inode = s->s_root->d_inode; + struct pid_namespace *ns = s->s_fs_info; + struct dentry *self; + + mutex_lock(&root_inode->i_mutex); + self = d_alloc_name(s->s_root, "self"); + if (self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_self_inode_operations; + d_add(self, inode); + } else { + dput(self); + self = ERR_PTR(-ENOMEM); + } + } else { + self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(self)) { + pr_err("proc_fill_super: can't allocate /proc/self\n"); + return PTR_ERR(self); + } + ns->proc_self = self; + return 0; +} - mode = S_IFLNK | S_IRWXUGO; - proc_self_symlink = proc_create("self", mode, NULL, NULL ); - proc_self_symlink->proc_iops = &proc_self_inode_operations; +void __init proc_self_init(void) +{ + proc_alloc_inum(&self_inum); } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index e296572c73ed..1cf86c0e8689 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,7 +184,7 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - unsigned size = 1024 + 128 * num_possible_cpus(); + size_t size = 1024 + 128 * num_possible_cpus(); char *buf; struct seq_file *m; int res; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d864d56..dbf61f6174f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + struct vm_area_struct *vma; + enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + .type = type, + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* @@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -794,6 +861,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -807,14 +875,17 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1110,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 9610ac772d7e..061894625903 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) for_each_possible_cpu(i) idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; - do_posix_clock_monotonic_gettime(&uptime); - monotonic_to_bootbased(&uptime); + get_monotonic_boottime(&uptime); nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b870f740ab5a..28503172f2e4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,7 +8,7 @@ */ #include <linux/mm.h> -#include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> @@ -20,8 +20,10 @@ #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include <asm/io.h> +#include "internal.h" /* List representing chunks of contiguous memory areas and their offsets in * vmcore file. @@ -31,6 +33,10 @@ static LIST_HEAD(vmcore_list); /* Stores the pointer to the buffer containing kernel elf core headers. */ static char *elfcorebuf; static size_t elfcorebuf_sz; +static size_t elfcorebuf_sz_orig; + +static char *elfnotes_buf; +static size_t elfnotes_sz; /* Total size of vmcore file. */ static u64 vmcore_size; @@ -117,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } -/* Maps vmcore file offset to respective physical address in memroy. */ -static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, - struct vmcore **m_ptr) -{ - struct vmcore *m; - u64 paddr; - - list_for_each_entry(m, vc_list, list) { - u64 start, end; - start = m->offset; - end = m->offset + m->size - 1; - if (offset >= start && offset <= end) { - paddr = m->paddr + offset - start; - *m_ptr = m; - return paddr; - } - } - *m_ptr = NULL; - return 0; -} - /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -146,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, { ssize_t acc = 0, tmp; size_t tsz; - u64 start, nr_bytes; - struct vmcore *curr_m = NULL; + u64 start; + struct vmcore *m = NULL; if (buflen == 0 || *fpos >= vmcore_size) return 0; @@ -158,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = elfcorebuf_sz - *fpos; - if (buflen < tsz) - tsz = buflen; + tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) return -EFAULT; buflen -= tsz; @@ -173,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } - start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); - if (!curr_m) - return -EINVAL; - - while (buflen) { - tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK)); + /* Read Elf note segment */ + if (*fpos < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; - - tmp = read_from_oldmem(buffer, tsz, &start, 1); - if (tmp < 0) - return tmp; + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + if (copy_to_user(buffer, kaddr, tsz)) + return -EFAULT; buflen -= tsz; *fpos += tsz; buffer += tsz; acc += tsz; - if (start >= (curr_m->paddr + curr_m->size)) { - if (curr_m->list.next == &vmcore_list) - return acc; /*EOF*/ - curr_m = list_entry(curr_m->list.next, - struct vmcore, list); - start = curr_m->paddr; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (*fpos < m->offset + m->size) { + tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); + start = m->paddr + *fpos - m->offset; + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; } } + return acc; } +/** + * alloc_elfnotes_buf - allocate buffer for ELF note segment in + * vmalloc memory + * + * @notes_sz: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *alloc_elfnotes_buf(size_t notes_sz) +{ +#ifdef CONFIG_MMU + return vmalloc_user(notes_sz); +#else + return vzalloc(notes_sz); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, len, tsz; + struct vmcore *m; + + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + end = start + size; + + if (size > vmcore_size || end > vmcore_size) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + + len = 0; + + if (start < elfcorebuf_sz) { + u64 pfn; + + tsz = min(elfcorebuf_sz - (size_t)start, size); + pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, pfn, tsz, + vma->vm_page_prot)) + return -EAGAIN; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + if (start < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); + kaddr = elfnotes_buf + start - elfcorebuf_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, + kaddr, tsz)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (start < m->offset + m->size) { + u64 paddr = 0; + + tsz = min_t(size_t, m->offset + m->size - start, size); + paddr = m->paddr + start - m->offset; + if (remap_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + } + + return 0; +fail: + do_munmap(vma->vm_mm, vma->vm_start, len); + return -EAGAIN; +} +#else +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +#endif + static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, .llseek = default_llseek, + .mmap = mmap_vmcore, }; static struct vmcore* __init get_new_element(void) @@ -213,61 +318,40 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size_elf64(char *elfptr) +static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { - int i; - u64 size; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr *phdr_ptr; - - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); - size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; - } - return size; -} - -static u64 __init get_vmcore_size_elf32(char *elfptr) -{ - int i; u64 size; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr *phdr_ptr; + struct vmcore *m; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); - size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; + size = elfsz + elfnotesegsz; + list_for_each_entry(m, vc_list, list) { + size += m->size; } return size; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf64_Phdr *phdr_ptr; Elf64_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); @@ -279,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { + while (real_sz < max_sz) { if (nhdr_ptr->n_namesz == 0) break; sz = sizeof(Elf64_Nhdr) + @@ -288,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + } + + return 0; +} + +/** + * get_note_number_and_size_elf64 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf64_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf64 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + + phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; } + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + rc = update_note_header_size_elf64(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf64_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -321,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -328,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return 0; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf32_Phdr *phdr_ptr; Elf32_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); @@ -360,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { + while (real_sz < max_sz) { if (nhdr_ptr->n_namesz == 0) break; sz = sizeof(Elf32_Nhdr) + @@ -369,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + } + + return 0; +} + +/** + * get_note_number_and_size_elf32 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf32_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf32 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + + phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; } + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + rc = update_note_header_size_elf32(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf32_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -402,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -413,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, * the new offset fields of exported program headers. */ static int __init process_ptload_program_headers_elf64(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -424,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset. */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } static int __init process_ptload_program_headers_elf32(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -461,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf64(char *elfptr, - struct list_head *vc_list) +static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; - Elf64_Ehdr *ehdr_ptr; struct vmcore *m; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { m->offset = vmcore_off; @@ -505,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr, } } -/* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf32(char *elfptr, - struct list_head *vc_list) +static void free_elfcorebuf(void) { - loff_t vmcore_off; - Elf32_Ehdr *ehdr_ptr; - struct vmcore *m; - - ehdr_ptr = (Elf32_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); - - list_for_each_entry(m, vc_list, list) { - m->offset = vmcore_off; - vmcore_off += m->size; - } + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = NULL; + vfree(elfnotes_buf); + elfnotes_buf = NULL; } static int __init parse_crash_elf64_headers(void) @@ -553,31 +829,32 @@ static int __init parse_crash_elf64_headers(void) } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) + + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, - &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf32_headers(void) @@ -608,31 +885,31 @@ static int __init parse_crash_elf32_headers(void) } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, - &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf_headers(void) @@ -654,20 +931,19 @@ static int __init parse_crash_elf_headers(void) rc = parse_crash_elf64_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf64(elfcorebuf); } else if (e_ident[EI_CLASS] == ELFCLASS32) { rc = parse_crash_elf32_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf32(elfcorebuf); } else { pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + return 0; } @@ -698,7 +974,7 @@ void vmcore_cleanup(void) struct list_head *pos, *next; if (proc_vmcore) { - remove_proc_entry(proc_vmcore->name, proc_vmcore->parent); + proc_remove(proc_vmcore); proc_vmcore = NULL; } @@ -710,7 +986,6 @@ void vmcore_cleanup(void) list_del(&m->list); kfree(m); } - kfree(elfcorebuf); - elfcorebuf = NULL; + free_elfcorebuf(); } EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 43b12807a51d..76a4eeb92982 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - sizeof(rec), psinfo); + 0, sizeof(rec), psinfo); local_irq_restore(flags); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index e4bcb2cf055a..71bf5f4ae84c 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (p->psi->erase) p->psi->erase(p->type, p->id, p->count, dentry->d_inode->i_ctime, p->psi); + else + return -EPERM; return simple_unlink(dir, dentry); } @@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_MCE: sprintf(name, "mce-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_RTAS: + sprintf(name, "rtas-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_OF: + sprintf(name, "powerpc-ofw-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_COMMON: + sprintf(name, "powerpc-common-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 86d1038b5a12..422962ae9fc2 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, break; ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, hsize + len, psinfo); + oopscount, hsize, hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; @@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -221,9 +221,11 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, - size_t size, struct pstore_info *psi) + size_t hsize, size_t size, + struct pstore_info *psi) { - return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); + return psi->write_buf(type, reason, id, part, psinfo->buf, hsize, + size, psi); } /* @@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi) { struct module *owner = psi->owner; + if (backend && strcmp(backend, psi->name)) + return -EPERM; + spin_lock(&pstore_lock); if (psinfo) { spin_unlock(&pstore_lock); return -EBUSY; } - if (backend && strcmp(backend, psi->name)) { - spin_unlock(&pstore_lock); - return -EINVAL; - } - if (!psi->write) psi->write = pstore_write_compat; psinfo = psi; @@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi) add_timer(&pstore_timer); } + pr_info("pstore: Registered %s as persistent store backend\n", + psi->name); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 288f068740f6..a6119f9469e2 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -83,7 +83,7 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; int dump_oops; - int ecc_size; + struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; unsigned int dump_read_cnt; @@ -136,6 +136,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, char **buf, struct pstore_info *psi) { ssize_t size; + ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; @@ -156,12 +157,18 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; size = persistent_ram_old_size(prz); - *buf = kmalloc(size, GFP_KERNEL); + + /* ECC correction notice */ + ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + + *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL); if (*buf == NULL) return -ENOMEM; + memcpy(*buf, persistent_ram_old(prz), size); + persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); - return size; + return size + ecc_notice_size; } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) @@ -188,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, - const char *buf, size_t size, + const char *buf, + size_t hsize, size_t size, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; @@ -323,7 +331,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, for (i = 0; i < cxt->max_dump_cnt; i++) { size_t sz = cxt->record_size; - cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size); + cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + &cxt->ecc_info); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", @@ -353,7 +362,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -391,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - if (!is_power_of_2(pdata->mem_size)) - pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); if (!is_power_of_2(pdata->record_size)) pdata->record_size = rounddown_pow_of_two(pdata->record_size); if (!is_power_of_2(pdata->console_size)) @@ -407,7 +414,7 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->dump_oops = pdata->dump_oops; - cxt->ecc_size = pdata->ecc_size; + cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; @@ -430,6 +437,7 @@ static int ramoops_probe(struct platform_device *pdev) pr_err("memory size too small, minimum is %zu\n", cxt->console_size + cxt->record_size + cxt->ftrace_size); + err = -EINVAL; goto fail_cnt; } @@ -447,6 +455,7 @@ static int ramoops_probe(struct platform_device *pdev) spin_lock_init(&cxt->pstore.buf_lock); if (!cxt->pstore.buf) { pr_err("cannot allocate pstore buffer\n"); + err = -ENOMEM; goto fail_clear; } @@ -465,9 +474,9 @@ static int ramoops_probe(struct platform_device *pdev) record_size = pdata->record_size; dump_oops = pdata->dump_oops; - pr_info("attached 0x%lx@0x%llx, ecc: %d\n", + pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->ecc_size); + cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); return 0; @@ -539,7 +548,7 @@ static void ramoops_register_dummy(void) * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). */ - dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; dummy = platform_device_register_data(NULL, "ramoops", -1, dummy_data, sizeof(struct ramoops_platform_data)); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0306303be372..de272d426763 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) } /* increase and wrap the start pointer, returning the old value */ -static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) +static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) { int old; int new; @@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) } /* increase the size counter until it hits the max size */ -static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) +static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; @@ -78,16 +78,63 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); } +static DEFINE_RAW_SPINLOCK(buffer_lock); + +/* increase and wrap the start pointer, returning the old value */ +static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) +{ + int old; + int new; + unsigned long flags; + + raw_spin_lock_irqsave(&buffer_lock, flags); + + old = atomic_read(&prz->buffer->start); + new = old + a; + while (unlikely(new > prz->buffer_size)) + new -= prz->buffer_size; + atomic_set(&prz->buffer->start, new); + + raw_spin_unlock_irqrestore(&buffer_lock, flags); + + return old; +} + +/* increase the size counter until it hits the max size */ +static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a) +{ + size_t old; + size_t new; + unsigned long flags; + + raw_spin_lock_irqsave(&buffer_lock, flags); + + old = atomic_read(&prz->buffer->size); + if (old == prz->buffer_size) + goto exit; + + new = old + a; + if (new > prz->buffer_size) + new = prz->buffer_size; + atomic_set(&prz->buffer->size, new); + +exit: + raw_spin_unlock_irqrestore(&buffer_lock, flags); +} + +static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic; +static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic; + static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; /* Initialize the parity buffer */ memset(par, 0, sizeof(par)); encode_rs8(prz->rs_decoder, data, len, par, 0); - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) ecc[i] = par[i]; } @@ -95,9 +142,9 @@ static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, void *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) par[i] = ecc[i]; return decode_rs8(prz->rs_decoder, data, par, len, NULL, 0, NULL, 0, NULL); @@ -110,15 +157,15 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, uint8_t *buffer_end = buffer->data + prz->buffer_size; uint8_t *block; uint8_t *par; - int ecc_block_size = prz->ecc_block_size; - int ecc_size = prz->ecc_size; - int size = prz->ecc_block_size; + int ecc_block_size = prz->ecc_info.block_size; + int ecc_size = prz->ecc_info.ecc_size; + int size = ecc_block_size; - if (!prz->ecc_size) + if (!ecc_size) return; block = buffer->data + (start & ~(ecc_block_size - 1)); - par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size; + par = prz->par_buffer + (start / ecc_block_size) * ecc_size; do { if (block + ecc_block_size > buffer_end) @@ -133,7 +180,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), @@ -146,14 +193,14 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) uint8_t *block; uint8_t *par; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; block = buffer->data; par = prz->par_buffer; while (block < buffer->data + buffer_size(prz)) { int numerr; - int size = prz->ecc_block_size; + int size = prz->ecc_info.block_size; if (block + size > buffer->data + prz->buffer_size) size = buffer->data + prz->buffer_size - block; numerr = persistent_ram_decode_rs8(prz, block, size, par); @@ -166,44 +213,49 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) block); prz->bad_blocks++; } - block += prz->ecc_block_size; - par += prz->ecc_size; + block += prz->ecc_info.block_size; + par += prz->ecc_info.ecc_size; } } static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int numerr; struct persistent_ram_buffer *buffer = prz->buffer; int ecc_blocks; size_t ecc_total; - int ecc_symsize = 8; - int ecc_poly = 0x11d; - if (!ecc_size) + if (!ecc_info || !ecc_info->ecc_size) return 0; - prz->ecc_block_size = 128; - prz->ecc_size = ecc_size; + prz->ecc_info.block_size = ecc_info->block_size ?: 128; + prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16; + prz->ecc_info.symsize = ecc_info->symsize ?: 8; + prz->ecc_info.poly = ecc_info->poly ?: 0x11d; - ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size); - ecc_total = (ecc_blocks + 1) * prz->ecc_size; + ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size, + prz->ecc_info.block_size + + prz->ecc_info.ecc_size); + ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size; if (ecc_total >= prz->buffer_size) { pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n", - __func__, prz->ecc_size, ecc_total, prz->buffer_size); + __func__, prz->ecc_info.ecc_size, + ecc_total, prz->buffer_size); return -EINVAL; } prz->buffer_size -= ecc_total; prz->par_buffer = buffer->data + prz->buffer_size; - prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size; + prz->par_header = prz->par_buffer + + ecc_blocks * prz->ecc_info.ecc_size; /* * first consecutive root is 0 * primitive element to generate roots = 1 */ - prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size); + prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, + 0, 1, prz->ecc_info.ecc_size); if (prz->rs_decoder == NULL) { pr_info("persistent_ram: init_rs failed\n"); return -EINVAL; @@ -230,6 +282,9 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, { ssize_t ret; + if (!prz->ecc_info.ecc_size) + return 0; + if (prz->corrected_bytes || prz->bad_blocks) ret = snprintf(str, len, "" "\n%d Corrected bytes, %d unrecoverable blocks\n", @@ -364,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size) return NULL; } + buffer_start_add = buffer_start_add_locked; + buffer_size_add = buffer_size_add_locked; + return ioremap(start, size); } @@ -391,11 +449,11 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int ret; - ret = persistent_ram_init_ecc(prz, ecc_size); + ret = persistent_ram_init_ecc(prz, ecc_info); if (ret) return ret; @@ -444,7 +502,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, int ecc_size) + u32 sig, struct persistent_ram_ecc_info *ecc_info) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -459,7 +517,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_size); + ret = persistent_ram_post_init(prz, sig, ecc_info); if (ret) goto err; diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 28ce014b3cef..b218f965817b 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -14,9 +14,9 @@ #include <linux/buffer_head.h> #include "qnx4.h" -static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int qnx4_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); unsigned int offset; struct buffer_head *bh; struct qnx4_inode_entry *de; @@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) int size; QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); - QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); + QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos)); - while (filp->f_pos < inode->i_size) { - blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); + while (ctx->pos < inode->i_size) { + blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS); bh = sb_bread(inode->i_sb, blknum); - if(bh==NULL) { + if (bh == NULL) { printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); - break; + return 0; } - ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; - while (ix < QNX4_INODES_PER_BLOCK) { + ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; + for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { offset = ix * QNX4_DIR_ENTRY_SIZE; de = (struct qnx4_inode_entry *) (bh->b_data + offset); - size = strlen(de->di_fname); - if (size) { - if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) - size = QNX4_SHORT_NAME_MAX; - else if ( size > QNX4_NAME_MAX ) - size = QNX4_NAME_MAX; - - if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { - QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); - if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) - ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; - else { - le = (struct qnx4_link_info*)de; - ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * - QNX4_INODES_PER_BLOCK + - le->dl_inode_ndx; - } - if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { - brelse(bh); - goto out; - } - } + if (!de->di_fname[0]) + continue; + if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + continue; + if (!(de->di_status & QNX4_FILE_LINK)) + size = QNX4_SHORT_NAME_MAX; + else + size = QNX4_NAME_MAX; + size = strnlen(de->di_fname, size); + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); + if (!(de->di_status & QNX4_FILE_LINK)) + ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; + else { + le = (struct qnx4_link_info*)de; + ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + QNX4_INODES_PER_BLOCK + + le->dl_inode_ndx; + } + if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) { + brelse(bh); + return 0; } - ix++; - filp->f_pos += QNX4_DIR_ENTRY_SIZE; } brelse(bh); } -out: return 0; } @@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = qnx4_readdir, + .iterate = qnx4_readdir, .fsync = generic_file_fsync, }; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8798d065e400..15b7d92ed60d 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, static int qnx6_dir_longfilename(struct inode *inode, struct qnx6_long_dir_entry *de, - void *dirent, loff_t pos, - unsigned de_inode, filldir_t filldir) + struct dir_context *ctx, + unsigned de_inode) { struct qnx6_long_filename *lf; struct super_block *s = inode->i_sb; @@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode, QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", lf_size, lf->lf_fname, de_inode)); - if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, - DT_UNKNOWN) < 0) { + if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { qnx6_put_page(page); return 0; } @@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode, return 1; } -static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int qnx6_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); + loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; bool done = false; - if (filp->f_pos >= inode->i_size) + ctx->pos = pos; + if (ctx->pos >= inode->i_size) return 0; for ( ; !done && n < npages; n++, start = 0) { @@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { printk(KERN_ERR "qnx6_readdir: read failed\n"); - filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; + ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; return PTR_ERR(page); } de = ((struct qnx6_dir_entry *)page_address(page)) + start; - for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { + for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { int size = de->de_size; u32 no_inode = fs32_to_cpu(sbi, de->de_inode); @@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) structure / block */ if (!qnx6_dir_longfilename(inode, (struct qnx6_long_dir_entry *)de, - dirent, pos, no_inode, - filldir)) { + ctx, no_inode)) { done = true; break; } @@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" " inode:%u\n", size, de->de_fname, no_inode)); - if (filldir(dirent, de->de_fname, size, - pos, no_inode, DT_UNKNOWN) - < 0) { + if (!dir_emit(ctx, de->de_fname, size, + no_inode, DT_UNKNOWN)) { done = true; break; } @@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) } qnx6_put_page(page); } - filp->f_pos = pos; return 0; } @@ -282,7 +279,7 @@ found: const struct file_operations qnx6_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = qnx6_readdir, + .iterate = qnx6_readdir, .fsync = generic_file_fsync, }; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3e64169ef527..fbad622841f9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write, return proc_dointvec(table, write, buffer, lenp, ppos); } -static ctl_table fs_dqstats_table[] = { +static struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], @@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = { { }, }; -static ctl_table fs_table[] = { +static struct ctl_table fs_table[] = { { .procname = "quota", .mode = 0555, @@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = { { }, }; -static ctl_table sys_table[] = { +static struct ctl_table sys_table[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/read_write.c b/fs/read_write.c index e6ddc8dceb96..122a3846d9e1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,6 +9,7 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -16,12 +17,15 @@ #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> -#include "read_write.h" #include "internal.h" #include <asm/uaccess.h> #include <asm/unistd.h> +typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -37,8 +41,19 @@ static inline int unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } -static loff_t lseek_execute(struct file *file, struct inode *inode, - loff_t offset, loff_t maxsize) +/** + * vfs_setpos - update the file offset for lseek + * @file: file structure in question + * @offset: file offset to seek to + * @maxsize: maximum file size + * + * This is a low-level filesystem helper for updating the file offset to + * the value specified by @offset if the given offset is valid and it is + * not equal to the current file offset. + * + * Return the specified offset on success and -EINVAL on invalid offset. + */ +loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; @@ -51,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, } return offset; } +EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files @@ -72,8 +88,6 @@ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { - struct inode *inode = file->f_mapping->host; - switch (whence) { case SEEK_END: offset += eof; @@ -93,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * like SEEK_SET. */ spin_lock(&file->f_lock); - offset = lseek_execute(file, inode, file->f_pos + offset, - maxsize); + offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: @@ -116,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, break; } - return lseek_execute(file, inode, offset, maxsize); + return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); @@ -128,7 +141,7 @@ EXPORT_SYMBOL(generic_file_llseek_size); * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by - * @offset and @whence under i_mutex. + * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { @@ -141,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) EXPORT_SYMBOL(generic_file_llseek); /** + * fixed_size_llseek - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: size of the file + * + */ +loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + size, size); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(fixed_size_llseek); + +/** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to @@ -292,7 +325,7 @@ out_putf: * them to something that fits in "int" so that others * won't have to do range checks all the time. */ -int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) +int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; loff_t pos; @@ -326,16 +359,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; @@ -347,13 +370,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -403,13 +420,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -459,6 +470,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ ret = rw_verify_area(WRITE, file, pos, count); if (ret >= 0) { count = ret; + file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else @@ -468,6 +480,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ add_wchar(current, ret); } inc_syscw(current); + file_end_write(file); } return ret; @@ -493,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_read(f.file, buf, count, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } return ret; @@ -508,15 +522,16 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_write(f.file, buf, count, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } return ret; } -SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -534,17 +549,9 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pread64((unsigned int) fd, (char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pread64, SyS_pread64); -#endif -SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -562,14 +569,6 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); -#endif /* * Reduce an iovec's length in-place. Return the resulting number of segments @@ -592,7 +591,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, +static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { struct kiocb kiocb; @@ -603,13 +602,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -617,7 +610,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, } /* Do it by hand, with file-ops */ -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, +static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, unsigned long nr_segs, loff_t *ppos, io_fn_t fn) { struct iovec *vector = iov; @@ -759,6 +752,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + file_start_write(file); } if (fnv) @@ -767,6 +761,9 @@ static ssize_t do_readv_writev(int type, struct file *file, else ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + if (type != READ) + file_end_write(file); + out: if (iov != iovstack) kfree(iov); @@ -814,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_readv(f.file, vec, vlen, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } @@ -833,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_writev(f.file, vec, vlen, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } @@ -897,12 +896,210 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max) +#ifdef CONFIG_COMPAT + +static ssize_t compat_do_readv_writev(int type, struct file *file, + const struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + ret = -EINVAL; + if (!file->f_op) + goto out; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); + if (ret <= 0) + goto out; + + tot_len = ret; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + file_start_write(file); + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + + if (type != READ) + file_end_write(file); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen) +{ + struct fd f = fdget(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_readv(f.file, vec, vlen, &pos); + if (ret >= 0) + f.file->f_pos = pos; + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PREAD) + ret = compat_readv(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_preadv64(fd, vec, vlen, pos); +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, + const struct compat_iovec __user *, vec, + unsigned long, vlen) +{ + struct fd f = fdget(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_writev(f.file, vec, vlen, &pos); + if (ret >= 0) + f.file->f_pos = pos; + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PWRITE) + ret = compat_writev(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; loff_t pos; + loff_t out_pos; ssize_t retval; int fl; @@ -916,12 +1113,14 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; - if (!ppos) - ppos = &in.file->f_pos; - else + if (!ppos) { + pos = in.file->f_pos; + } else { + pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; - retval = rw_verify_area(READ, in.file, ppos, count); + } + retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; count = retval; @@ -938,7 +1137,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); - retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); + out_pos = out.file->f_pos; + retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; count = retval; @@ -946,7 +1146,6 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); - pos = *ppos; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) @@ -965,18 +1164,25 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in.file, ppos, out.file, count, fl); + file_start_write(out.file); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); + file_end_write(out.file); if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); + out.file->f_pos = out_pos; + if (ppos) + *ppos = pos; + else + in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); - if (*ppos > max) + if (pos > max) retval = -EOVERFLOW; fput_out: @@ -1022,3 +1228,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si return do_sendfile(out_fd, in_fd, NULL, count, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, + compat_off_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, + compat_loff_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif diff --git a/fs/read_write.h b/fs/read_write.h deleted file mode 100644 index d3e00ef67420..000000000000 --- a/fs/read_write.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * This file is only for sharing some helpers from read_write.c with compat.c. - * Don't use anywhere else. - */ - - -typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); -typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); - -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, io_fn_t fn); -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max); diff --git a/fs/readdir.c b/fs/readdir.c index fee38e04fae4..93d71e574310 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -20,11 +20,11 @@ #include <asm/uaccess.h> -int vfs_readdir(struct file *file, filldir_t filler, void *buf) +int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); int res = -ENOTDIR; - if (!file->f_op || !file->f_op->readdir) + if (!file->f_op || !file->f_op->iterate) goto out; res = security_file_permission(file, MAY_READ); @@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) res = -ENOENT; if (!IS_DEADDIR(inode)) { - res = file->f_op->readdir(file, buf, filler); + ctx->pos = file->f_pos; + res = file->f_op->iterate(file, ctx); + file->f_pos = ctx->pos; file_accessed(file); } mutex_unlock(&inode->i_mutex); out: return res; } - -EXPORT_SYMBOL(vfs_readdir); +EXPORT_SYMBOL(iterate_dir); /* * Traditional linux readdir() handling.. @@ -66,6 +67,7 @@ struct old_linux_dirent { }; struct readdir_callback { + struct dir_context ctx; struct old_linux_dirent __user * dirent; int result; }; @@ -73,7 +75,7 @@ struct readdir_callback { static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct readdir_callback * buf = (struct readdir_callback *) __buf; + struct readdir_callback *buf = (struct readdir_callback *) __buf; struct old_linux_dirent __user * dirent; unsigned long d_ino; @@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, { int error; struct fd f = fdget(fd); - struct readdir_callback buf; + struct readdir_callback buf = { + .ctx.actor = fillonedir, + .dirent = dirent + }; if (!f.file) return -EBADF; - buf.result = 0; - buf.dirent = dirent; - - error = vfs_readdir(f.file, fillonedir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (buf.result) error = buf.result; @@ -137,6 +139,7 @@ struct linux_dirent { }; struct getdents_callback { + struct dir_context ctx; struct linux_dirent __user * current_dir; struct linux_dirent __user * previous; int count; @@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, { struct fd f; struct linux_dirent __user * lastdirent; - struct getdents_callback buf; + struct getdents_callback buf = { + .ctx.actor = filldir, + .count = count, + .current_dir = dirent + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, filldir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - if (put_user(f.file->f_pos, &lastdirent->d_off)) + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, } struct getdents_callback64 { + struct dir_context ctx; struct linux_dirent64 __user * current_dir; struct linux_dirent64 __user * previous; int count; @@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, { struct fd f; struct linux_dirent64 __user * lastdirent; - struct getdents_callback64 buf; + struct getdents_callback64 buf = { + .ctx.actor = filldir64, + .count = count, + .current_dir = dirent + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, filldir64, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - typeof(lastdirent->d_off) d_off = f.file->f_pos; + typeof(lastdirent->d_off) d_off = buf.ctx.pos; if (__put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 66c53b642a88..03e4ca5624d6 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -13,14 +13,14 @@ extern const struct reiserfs_key MIN_KEY; -static int reiserfs_readdir(struct file *, void *, filldir_t); +static int reiserfs_readdir(struct file *, struct dir_context *); static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, int datasync); const struct file_operations reiserfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = reiserfs_readdir, + .iterate = reiserfs_readdir, .fsync = reiserfs_dir_fsync, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT @@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, #define store_ih(where,what) copy_item_head (where, what) -static inline bool is_privroot_deh(struct dentry *dir, - struct reiserfs_de_head *deh) +static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) { - struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - return (dir == dir->d_parent && privroot->d_inode && + struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; + return (privroot->d_inode && deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); } -int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, - filldir_t filldir, loff_t *pos) +int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) { - struct inode *inode = dentry->d_inode; struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; @@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, /* form key for search the next directory entry using f_pos field of file structure */ - make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); + make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); next_pos = cpu_key_k_offset(&pos_key); path_to_entry.reada = PATH_READA; @@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, entry_num++, deh++) { int d_reclen; char *d_name; - off_t d_off; ino_t d_ino; if (!de_visible(deh)) @@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, } /* Ignore the .reiserfs_priv entry */ - if (is_privroot_deh(dentry, deh)) + if (is_privroot_deh(inode, deh)) continue; - d_off = deh_offset(deh); - *pos = d_off; + ctx->pos = deh_offset(deh); d_ino = deh_objectid(deh); if (d_reclen <= 32) { local_buf = small_buf; @@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, * the write lock here for other waiters */ reiserfs_write_unlock(inode->i_sb); - if (filldir - (dirent, local_buf, d_reclen, d_off, d_ino, - DT_UNKNOWN) < 0) { + if (!dir_emit + (ctx, local_buf, d_reclen, d_ino, + DT_UNKNOWN)) { reiserfs_write_lock(inode->i_sb); if (local_buf != small_buf) { kfree(local_buf); @@ -204,6 +199,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, next_pos = deh_offset(deh) + 1; if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); goto research; } } /* for */ @@ -235,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, } /* while */ end: - *pos = next_pos; + ctx->pos = next_pos; pathrelse(&path_to_entry); reiserfs_check_path(&path_to_entry); out: @@ -243,10 +240,9 @@ out: return ret; } -static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int reiserfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = file->f_path.dentry; - return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos); + return reiserfs_readdir_inode(file_inode(file), ctx); } /* compose directory item containing "." and ".." entries (entries are diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6165bd4784f6..dcaafcfc23b0 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -234,68 +234,9 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, return ret; } -/* Write @count bytes at position @ppos in a file indicated by @file - from the buffer @buf. - - generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want - something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was - written for (ext2/3). This is for several reasons: - - * It has no understanding of any filesystem specific optimizations. - - * It enters the filesystem repeatedly for each page that is written. - - * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key - * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time - * to reiserfs which allows for fewer tree traversals. - - * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. - - * Asking the block allocation code for blocks one at a time is slightly less efficient. - - All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to - use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make - things right finally. - - Future Features: providing search_by_key with hints. - -*/ -static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */ - const char __user * buf, /* pointer to user supplied data - (in userspace) */ - size_t count, /* amount of bytes to write */ - loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to - * new current position before returning. */ - ) -{ - struct inode *inode = file_inode(file); // Inode of the file that we are writing to. - /* To simplify coding at this time, we store - locked pages in array for now */ - struct reiserfs_transaction_handle th; - th.t_trans_id = 0; - - /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items - * lying around (most of the disk, in fact). Despite the filesystem - * now being a v3.6 format, the old items still can't support large - * file sizes. Catch this case here, as the rest of the VFS layer is - * oblivious to the different limitations between old and new items. - * reiserfs_setattr catches this for truncates. This chunk is lifted - * from generic_write_checks. */ - if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && - *ppos + count > MAX_NON_LFS) { - if (*ppos >= MAX_NON_LFS) { - return -EFBIG; - } - if (count > MAX_NON_LFS - (unsigned long)*ppos) - count = MAX_NON_LFS - (unsigned long)*ppos; - } - - return do_sync_write(file, buf, count, ppos); -} - const struct file_operations reiserfs_file_operations = { .read = do_sync_read, - .write = reiserfs_file_write, + .write = do_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..0048cc16a6a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,6 +18,7 @@ #include <linux/writeback.h> #include <linux/quotaops.h> #include <linux/swap.h> +#include <linux/aio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); @@ -1810,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - if (insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args) < 0) { + + reiserfs_write_unlock(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock(inode->i_sb); + if (err) { err = -EINVAL; goto out_bad_inode; } + if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. @@ -2969,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) } /* clm -- taken from fs/buffer.c:block_invalidate_page */ -static void reiserfs_invalidatepage(struct page *page, unsigned long offset) +static void reiserfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; struct inode *inode = page->mapping->host; unsigned int curr_off = 0; + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int ret = 1; BUG_ON(!PageLocked(page)); - if (offset == 0) + if (!partial_page) ClearPageChecked(page); if (!page_has_buffers(page)) @@ -2990,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + goto out; + /* * is this block fully invalidated? */ @@ -3008,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ - if (!offset && ret) { + if (!partial_page && ret) { ret = try_to_release_page(page, 0); /* maybe should BUG_ON(!ret); - neilb */ } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index afcadcc03e8a..742fdd4c209a 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s, static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks); -static int release_journal_dev(struct super_block *super, +static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal); static int dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl); @@ -2532,23 +2532,13 @@ static void journal_list_init(struct super_block *sb) SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); } -static int release_journal_dev(struct super_block *super, +static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal) { - int result; - - result = 0; - if (journal->j_dev_bd != NULL) { - result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = NULL; } - - if (result != 0) { - reiserfs_warning(super, "sh-457", - "Cannot release journal device: %i", result); - } - return result; } static int journal_init_dev(struct super_block *super, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 9cc0740adffa..33532f79b4f7 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -394,20 +394,24 @@ static int set_sb(struct super_block *sb, void *data) return -ENOENT; } +struct reiserfs_seq_private { + struct super_block *sb; + int (*show) (struct seq_file *, struct super_block *); +}; + static void *r_start(struct seq_file *m, loff_t * pos) { - struct proc_dir_entry *de = m->private; - struct super_block *s = de->parent->data; + struct reiserfs_seq_private *priv = m->private; loff_t l = *pos; if (l) return NULL; - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s))) + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb))) return NULL; - up_write(&s->s_umount); - return s; + up_write(&priv->sb->s_umount); + return priv->sb; } static void *r_next(struct seq_file *m, void *v, loff_t * pos) @@ -426,9 +430,8 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct proc_dir_entry *de = m->private; - int (*show) (struct seq_file *, struct super_block *) = de->data; - return show(m, v); + struct reiserfs_seq_private *priv = m->private; + return priv->show(m, v); } static const struct seq_operations r_ops = { @@ -440,11 +443,15 @@ static const struct seq_operations r_ops = { static int r_open(struct inode *inode, struct file *file) { - int ret = seq_open(file, &r_ops); + struct reiserfs_seq_private *priv; + int ret = seq_open_private(file, &r_ops, + sizeof(struct reiserfs_seq_private)); if (!ret) { struct seq_file *m = file->private_data; - m->private = PDE(inode); + priv = m->private; + priv->sb = proc_get_parent_data(inode); + priv->show = PDE_DATA(inode); } return ret; } @@ -453,7 +460,7 @@ static const struct file_operations r_file_operations = { .open = r_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, .owner = THIS_MODULE, }; @@ -479,9 +486,8 @@ int reiserfs_proc_info_init(struct super_block *sb) *s = '!'; spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); + REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); if (REISERFS_SB(sb)->procdir) { - REISERFS_SB(sb)->procdir->data = sb; add_file(sb, "version", show_version); add_file(sb, "super", show_super); add_file(sb, "per-level", show_per_level); @@ -499,29 +505,17 @@ int reiserfs_proc_info_init(struct super_block *sb) int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; - char b[BDEVNAME_SIZE]; - char *s; + if (de) { + char b[BDEVNAME_SIZE]; + char *s; - /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); - s = strchr(b, '/'); - if (s) - *s = '!'; + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; - if (de) { - remove_proc_entry("journal", de); - remove_proc_entry("oidmap", de); - remove_proc_entry("on-disk-super", de); - remove_proc_entry("bitmap", de); - remove_proc_entry("per-level", de); - remove_proc_entry("super", de); - remove_proc_entry("version", de); - } - spin_lock(&__PINFO(sb).lock); - __PINFO(sb).exiting = 1; - spin_unlock(&__PINFO(sb).lock); - if (proc_info_root) { - remove_proc_entry(b, proc_info_root); + remove_proc_subtree(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 157e474ab303..3df5ce6c724d 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations; extern const struct inode_operations reiserfs_symlink_inode_operations; extern const struct inode_operations reiserfs_special_inode_operations; extern const struct file_operations reiserfs_dir_operations; -int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *); +int reiserfs_readdir_inode(struct inode *, struct dir_context *); /* tail_conversion.c */ int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4cce1d9552fb..c69cdd749f09 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) * modifying extended attributes. This includes operations such as permissions * or ownership changes, object deletions, etc. */ struct reiserfs_dentry_buf { + struct dir_context ctx; struct dentry *xadir; int count; struct dentry *dentries[8]; @@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode, { struct dentry *dir; int i, err = 0; - loff_t pos = 0; struct reiserfs_dentry_buf buf = { - .count = 0, + .ctx.actor = fill_with_dentries, }; /* Skip out, an xattr has no xattrs associated with it */ @@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode, reiserfs_write_lock(inode->i_sb); buf.xadir = dir; - err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); - while ((err == 0 || err == -ENOSPC) && buf.count) { - err = 0; - - for (i = 0; i < buf.count && buf.dentries[i]; i++) { - int lerr = 0; + while (1) { + err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); + if (err) + break; + if (!buf.count) + break; + for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { struct dentry *dentry = buf.dentries[i]; - if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) - lerr = action(dentry, data); + if (!S_ISDIR(dentry->d_inode->i_mode)) + err = action(dentry, data); dput(dentry); buf.dentries[i] = NULL; - err = lerr ?: err; } + if (err) + break; buf.count = 0; - if (!err) - err = reiserfs_readdir_dentry(dir, &buf, - fill_with_dentries, &pos); } mutex_unlock(&dir->d_inode->i_mutex); - /* Clean up after a failed readdir */ cleanup_dentry_buf(&buf); if (!err) { @@ -318,7 +316,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data) static int chown_one_xattr(struct dentry *dentry, void *data) { struct iattr *attrs = data; - return reiserfs_setattr(dentry, attrs); + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; } /* No i_mutex, but the inode is unconnected. */ @@ -788,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) } struct listxattr_buf { + struct dir_context ctx; size_t size; size_t pos; char *buf; @@ -833,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { struct dentry *dir; int err = 0; - loff_t pos = 0; struct listxattr_buf buf = { + .ctx.actor = listxattr_filler, .dentry = dentry, .buf = buffer, .size = buffer ? size : 0, @@ -856,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); + err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); mutex_unlock(&dir->d_inode->i_mutex); if (!err) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d7c01ef64eda..6c8767fdfc6a 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode) int depth; int error; + if (IS_PRIVATE(inode)) + return 0; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index e1a7779dd3cb..f373bde8f545 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -49,8 +49,11 @@ static unsigned long romfs_get_unmapped_area(struct file *file, return (unsigned long) -EINVAL; offset += ROMFS_I(inode)->i_dataoffset; - if (offset > mtd->size - len) + if (offset >= mtd->size) return (unsigned long) -EINVAL; + /* the mapping mustn't extend beyond the EOF */ + if ((offset + len) > mtd->size) + len = mtd->size - offset; ret = mtd_get_unmapped_area(mtd, len, offset, flags); if (ret == -EOPNOTSUPP) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 15cbc41ee365..ff1d3d42e72a 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = { /* * read the entries from a directory */ -static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int romfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *i = file_inode(filp); + struct inode *i = file_inode(file); struct romfs_inode ri; unsigned long offset, maxoff; int j, ino, nextfh; - int stored = 0; char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ int ret; maxoff = romfs_maxsize(i->i_sb); - offset = filp->f_pos; + offset = ctx->pos; if (!offset) { offset = i->i_ino & ROMFH_MASK; ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); @@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) for (;;) { if (!offset || offset >= maxoff) { offset = maxoff; - filp->f_pos = offset; + ctx->pos = offset; goto out; } - filp->f_pos = offset; + ctx->pos = offset; /* Fetch inode info */ ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); @@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) nextfh = be32_to_cpu(ri.next); if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) ino = be32_to_cpu(ri.spec); - if (filldir(dirent, fsname, j, offset, ino, - romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + if (!dir_emit(ctx, fsname, j, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE])) goto out; - stored++; offset = nextfh & ROMFH_MASK; } - out: - return stored; + return 0; } /* @@ -281,7 +278,7 @@ error: static const struct file_operations romfs_dir_operations = { .read = generic_read_dir, - .readdir = romfs_readdir, + .iterate = romfs_readdir, .llseek = default_llseek, }; diff --git a/fs/select.c b/fs/select.c index 8c1c96c27062..f9f49c40cfd4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -27,6 +27,8 @@ #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/sched/rt.h> +#include <linux/freezer.h> +#include <net/ll_poll.h> #include <asm/uaccess.h> @@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, set_current_state(state); if (!pwq->triggered) - rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + rc = freezable_schedule_hrtimeout_range(expires, slack, + HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* @@ -384,9 +387,10 @@ get_max: #define POLLEX_SET (POLLPRI) static inline void wait_key_set(poll_table *wait, unsigned long in, - unsigned long out, unsigned long bit) + unsigned long out, unsigned long bit, + unsigned int ll_flag) { - wait->_key = POLLEX_SET; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) @@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + unsigned long busy_end = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) f_op = f.file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { - wait_key_set(wait, in, out, bit); + wait_key_set(wait, in, out, + bit, busy_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } } if (res_in) @@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to @@ -717,7 +748,9 @@ struct poll_list { * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ -static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, + bool *can_busy_poll, + unsigned int busy_flag) { unsigned int mask; int fd; @@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); + if (mask & busy_flag) + *can_busy_poll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + unsigned long busy_end = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; + bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. */ - if (do_pollfd(pfd, pt)) { + if (do_pollfd(pfd, pt, &can_busy_loop, + busy_flag)) { count++; pt->_qproc = NULL; + /* found something, stop busy polling */ + busy_flag = 0; + can_busy_loop = false; } } } @@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to diff --git a/fs/seq_file.c b/fs/seq_file.c index 38bb59f3f2ad..3135c2525c76 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -599,6 +599,24 @@ int single_open(struct file *file, int (*show)(struct seq_file *, void *), } EXPORT_SYMBOL(single_open); +int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), + void *data, size_t size) +{ + char *buf = kmalloc(size, GFP_KERNEL); + int ret; + if (!buf) + return -ENOMEM; + ret = single_open(file, show, data); + if (ret) { + kfree(buf); + return ret; + } + ((struct seq_file *)file->private_data)->buf = buf; + ((struct seq_file *)file->private_data)->size = size; + return 0; +} +EXPORT_SYMBOL(single_open_size); + int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; @@ -903,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v, return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); + +/** + * seq_hlist_start_precpu - start an iteration of a percpu hlist array + * @head: pointer to percpu array of struct hlist_heads + * @cpu: pointer to cpu "cursor" + * @pos: start position of sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node * +seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) +{ + struct hlist_node *node; + + for_each_possible_cpu(*cpu) { + hlist_for_each(node, per_cpu_ptr(head, *cpu)) { + if (pos-- == 0) + return node; + } + } + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_percpu); + +/** + * seq_hlist_next_percpu - move to the next position of the percpu hlist array + * @v: pointer to current hlist_node + * @head: pointer to percpu array of struct hlist_heads + * @cpu: pointer to cpu "cursor" + * @pos: start position of sequence + * + * Called at seq_file->op->next(). + */ +struct hlist_node * +seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, + int *cpu, loff_t *pos) +{ + struct hlist_node *node = v; + + ++*pos; + + if (node->next) + return node->next; + + for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; + *cpu = cpumask_next(*cpu, cpu_possible_mask)) { + struct hlist_head *bucket = per_cpu_ptr(head, *cpu); + + if (!hlist_empty(bucket)) + return bucket->first; + } + return NULL; +} +EXPORT_SYMBOL(seq_hlist_next_percpu); diff --git a/fs/signalfd.c b/fs/signalfd.c index b53486961735..424b7b65321f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,7 @@ #include <linux/signalfd.h> #include <linux/syscalls.h> #include <linux/proc_fs.h> +#include <linux/compat.h> void signalfd_cleanup(struct sighand_struct *sighand) { @@ -311,3 +312,33 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, { return sys_signalfd4(ufd, user_mask, sizemask, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize, + int, flags) +{ + compat_sigset_t ss32; + sigset_t tmp; + sigset_t __user *ksigmask; + + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&tmp, &ss32); + ksigmask = compat_alloc_user_space(sizeof(sigset_t)); + if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + return -EFAULT; + + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} +#endif diff --git a/fs/splice.c b/fs/splice.c index 29e394e49ddd..3b7ee656f3aa 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> +#include <linux/compat.h> #include "internal.h" /* @@ -218,7 +219,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, page_nr++; ret += buf->len; - if (pipe->inode) + if (pipe->files) do_wakeup = 1; if (!--spd->nr_pages) @@ -828,7 +829,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, ops->release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; - if (pipe->inode) + if (pipe->files) sd->need_wakeup = true; } @@ -1000,8 +1001,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - sb_start_write(inode->i_sb); - pipe_lock(pipe); splice_from_pipe_begin(&sd); @@ -1037,7 +1036,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos += ret; balance_dirty_pages_ratelimited(mapping); } - sb_end_write(inode->i_sb); return ret; } @@ -1100,17 +1098,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); - int ret; - - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - - if (unlikely(out->f_flags & O_APPEND)) - return -EINVAL; - - ret = rw_verify_area(WRITE, out, ppos, len); - if (unlikely(ret < 0)) - return ret; if (out->f_op && out->f_op->splice_write) splice_write = out->f_op->splice_write; @@ -1183,7 +1170,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, */ pipe = current->splice_pipe; if (unlikely(!pipe)) { - pipe = alloc_pipe_info(NULL); + pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; @@ -1273,7 +1260,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } @@ -1282,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * @in: file to splice from * @ppos: input file offset * @out: file to splice to + * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * @@ -1293,7 +1281,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) + loff_t *opos, size_t len, unsigned int flags) { struct splice_desc sd = { .len = len, @@ -1301,9 +1289,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .opos = opos, }; long ret; + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, opos, len); + if (unlikely(ret < 0)) + return ret; + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) *ppos = sd.pos; @@ -1324,7 +1323,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; - loff_t offset, *off; + loff_t offset; long ret; ipipe = get_pipe_info(in); @@ -1355,13 +1354,27 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &out->f_pos; + } else { + offset = out->f_pos; + } + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; - ret = do_splice_from(ipipe, out, off, len, flags); + ret = rw_verify_area(WRITE, out, &offset, len); + if (unlikely(ret < 0)) + return ret; + + file_start_write(out); + ret = do_splice_from(ipipe, out, &offset, len, flags); + file_end_write(out); - if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1375,13 +1388,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &in->f_pos; + } else { + offset = in->f_pos; + } - ret = do_splice_to(in, off, opipe, len, flags); + ret = do_splice_to(in, &offset, opipe, len, flags); - if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1690,6 +1705,27 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, + unsigned int, nr_segs, unsigned int, flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} +#endif + SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 57dc70ebbb19..f7f527bf8c10 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb, } -static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int squashfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; @@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) * It also means that the external f_pos is offset by 3 from the * on-disk directory f_pos. */ - while (file->f_pos < 3) { + while (ctx->pos < 3) { char *name; int i_ino; - if (file->f_pos == 0) { + if (ctx->pos == 0) { name = "."; size = 1; i_ino = inode->i_ino; @@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) i_ino = squashfs_i(inode)->parent; } - TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", - dirent, name, size, file->f_pos, i_ino, - squashfs_filetype_table[1]); - - if (filldir(dirent, name, size, file->f_pos, i_ino, - squashfs_filetype_table[1]) < 0) { - TRACE("Filldir returned less than 0\n"); + if (!dir_emit(ctx, name, size, i_ino, + squashfs_filetype_table[1])) goto finish; - } - file->f_pos += size; + ctx->pos += size; } length = get_dir_index_using_offset(inode->i_sb, &block, &offset, squashfs_i(inode)->dir_idx_start, squashfs_i(inode)->dir_idx_offset, squashfs_i(inode)->dir_idx_cnt, - file->f_pos); + ctx->pos); while (length < i_size_read(inode)) { /* @@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) length += sizeof(*dire) + size; - if (file->f_pos >= length) + if (ctx->pos >= length) continue; dire->name[size] = '\0'; @@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) ((short) le16_to_cpu(dire->inode_number)); type = le16_to_cpu(dire->type); - TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" - "\n", dirent, dire->name, size, - file->f_pos, - le32_to_cpu(dirh.start_block), - le16_to_cpu(dire->offset), - inode_number, - squashfs_filetype_table[type]); - - if (filldir(dirent, dire->name, size, file->f_pos, + if (!dir_emit(ctx, dire->name, size, inode_number, - squashfs_filetype_table[type]) < 0) { - TRACE("Filldir returned less than 0\n"); + squashfs_filetype_table[type])) goto finish; - } - file->f_pos = length; + ctx->pos = length; } } @@ -238,6 +222,6 @@ failed_read: const struct file_operations squashfs_dir_ops = { .read = generic_read_dir, - .readdir = squashfs_readdir, + .iterate = squashfs_readdir, .llseek = default_llseek, }; diff --git a/fs/sync.c b/fs/sync.c index 2c5d6639a66a..905f3f6b3d85 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -283,8 +283,8 @@ EXPORT_SYMBOL(generic_write_sync); * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. */ -SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, - unsigned int flags) +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) { int ret; struct fd f; @@ -365,29 +365,11 @@ out_put: out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes, - long flags) -{ - return SYSC_sync_file_range((int) fd, offset, nbytes, - (unsigned int) flags); -} -SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range); -#endif /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ -SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags, - loff_t offset, loff_t nbytes) +SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, + loff_t, offset, loff_t, nbytes) { return sys_sync_file_range(fd, offset, nbytes, flags); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range2(long fd, long flags, - loff_t offset, loff_t nbytes) -{ - return SYSC_sync_file_range2((int) fd, (unsigned int) flags, - offset, nbytes); -} -SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); -#endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e14512678c9b..e068e744dbdd 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left, } /** - * sysfs_link_subling - link sysfs_dirent into sibling rbtree + * sysfs_link_sibling - link sysfs_dirent into sibling rbtree * @sd: sysfs_dirent of interest * * Link @sd into its sibling rbtree which starts from @@ -165,21 +165,8 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return NULL; - while (1) { - int v, t; - - v = atomic_read(&sd->s_active); - if (unlikely(v < 0)) - return NULL; - - t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) - break; - if (t < 0) - return NULL; - - cpu_relax(); - } + if (!atomic_inc_unless_negative(&sd->s_active)) + return NULL; if (likely(!ignore_lockdep(sd))) rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); @@ -281,6 +268,10 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) */ parent_sd = sd->s_parent; + WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), + "sysfs: free using entry: %s/%s\n", + parent_sd ? parent_sd->s_name : "", sd->s_name); + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) @@ -399,7 +390,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) sd->s_name = name; sd->s_mode = mode; - sd->s_flags = type; + sd->s_flags = type | SYSFS_FLAG_REMOVED; return sd; @@ -479,6 +470,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } + /* Mark the entry added into directory tree */ + sd->s_flags &= ~SYSFS_FLAG_REMOVED; + return 0; } @@ -1004,61 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, return pos; } -static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int sysfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = filp->private_data; + struct sysfs_dirent *pos = file->private_data; enum kobj_ns_type type; const void *ns; - ino_t ino; type = sysfs_ns_type(parent_sd); ns = sysfs_info(dentry->d_sb)->ns[type]; - if (filp->f_pos == 0) { - ino = parent_sd->s_ino; - if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) - filp->f_pos++; - else - return 0; - } - if (filp->f_pos == 1) { - if (parent_sd->s_parent) - ino = parent_sd->s_parent->s_ino; - else - ino = parent_sd->s_ino; - if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) - filp->f_pos++; - else - return 0; - } + if (!dir_emit_dots(file, ctx)) + return 0; mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); + for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); pos; - pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { - const char * name; - unsigned int type; - int len, ret; - - name = pos->s_name; - len = strlen(name); - ino = pos->s_ino; - type = dt_type(pos); - filp->f_pos = pos->s_hash; - filp->private_data = sysfs_get(pos); + pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { + const char *name = pos->s_name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->s_ino; + ctx->pos = pos->s_hash; + file->private_data = sysfs_get(pos); mutex_unlock(&sysfs_mutex); - ret = filldir(dirent, name, len, filp->f_pos, ino, type); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; mutex_lock(&sysfs_mutex); - if (ret < 0) - break; } mutex_unlock(&sysfs_mutex); - if ((filp->f_pos > 1) && !pos) { /* EOF */ - filp->f_pos = INT_MAX; - filp->private_data = NULL; - } + file->private_data = NULL; + ctx->pos = INT_MAX; return 0; } @@ -1076,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, - .readdir = sysfs_readdir, + .iterate = sysfs_readdir, .release = sysfs_dir_release, .llseek = sysfs_dir_llseek, }; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 602f56db0442..d2bb7ed8fa74 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); + if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } } spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 0ce3ccf7f401..3e2837a633ed 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -24,8 +24,6 @@ #include <linux/security.h> #include "sysfs.h" -extern struct super_block * sysfs_sb; - static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 3799e8dac3eb..d42291d08215 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -18,12 +18,12 @@ #include <linux/swap.h> #include "sysv.h" -static int sysv_readdir(struct file *, void *, filldir_t); +static int sysv_readdir(struct file *, struct dir_context *); const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = sysv_readdir, + .iterate = sysv_readdir, .fsync = generic_file_fsync, }; @@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n) return page; } -static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int sysv_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = filp->f_pos; - struct inode *inode = file_inode(filp); + unsigned long pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); + unsigned offset; + unsigned long n; - pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); + ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); if (pos >= inode->i_size) - goto done; + return 0; + + offset = pos & ~PAGE_CACHE_MASK; + n = pos >> PAGE_CACHE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) kaddr = (char *)page_address(page); de = (struct sysv_dir_entry *)(kaddr+offset); limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; - for ( ;(char*)de <= limit; de++) { + for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { char *name = de->name; - int over; if (!de->inode) continue; - offset = (char *)de - kaddr; - - over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), - ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, + if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), - DT_UNKNOWN); - if (over) { + DT_UNKNOWN)) { dir_put_page(page); - goto done; + return 0; } } dir_put_page(page); } - -done: - filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; return 0; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 1c0d5f264767..731b2bbcaab3 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int sysv_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int sysv_hash(const struct dentry *dentry, struct qstr *qstr) { /* Truncate the name in place, avoids having to define a compare function. */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 32b644f03690..929312180dd0 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -8,6 +8,7 @@ * */ +#include <linux/alarmtimer.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> @@ -26,7 +27,10 @@ #include <linux/rcupdate.h> struct timerfd_ctx { - struct hrtimer tmr; + union { + struct hrtimer tmr; + struct alarm alarm; + } t; ktime_t tintv; ktime_t moffs; wait_queue_head_t wqh; @@ -41,14 +45,19 @@ struct timerfd_ctx { static LIST_HEAD(cancel_list); static DEFINE_SPINLOCK(cancel_lock); +static inline bool isalarm(struct timerfd_ctx *ctx) +{ + return ctx->clockid == CLOCK_REALTIME_ALARM || + ctx->clockid == CLOCK_BOOTTIME_ALARM; +} + /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, * tintv.tv64 != 0) until the timer is accessed. */ -static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +static void timerfd_triggered(struct timerfd_ctx *ctx) { - struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr); unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); @@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) ctx->ticks++; wake_up_locked(&ctx->wqh); spin_unlock_irqrestore(&ctx->wqh.lock, flags); +} +static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +{ + struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, + t.tmr); + timerfd_triggered(ctx); return HRTIMER_NORESTART; } +static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, + ktime_t now) +{ + struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, + t.alarm); + timerfd_triggered(ctx); + return ALARMTIMER_NORESTART; +} + /* * Called when the clock was set to cancel the timers in the cancel * list. This will wake up processes waiting on these timers. The @@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { - if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && - (flags & TFD_TIMER_CANCEL_ON_SET)) { + if ((ctx->clockid == CLOCK_REALTIME || + ctx->clockid == CLOCK_REALTIME_ALARM) && + (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { if (!ctx->might_cancel) { ctx->might_cancel = true; spin_lock(&cancel_lock); @@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; - remaining = hrtimer_expires_remaining(&ctx->tmr); + if (isalarm(ctx)) + remaining = alarm_expires_remaining(&ctx->t.alarm); + else + remaining = hrtimer_expires_remaining(&ctx->t.tmr); + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } @@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, clockid, htmode); - hrtimer_set_expires(&ctx->tmr, texp); - ctx->tmr.function = timerfd_tmrproc; + + if (isalarm(ctx)) { + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + } else { + hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_set_expires(&ctx->t.tmr, texp); + ctx->t.tmr.function = timerfd_tmrproc; + } + if (texp.tv64 != 0) { - hrtimer_start(&ctx->tmr, texp, htmode); + if (isalarm(ctx)) { + if (flags & TFD_TIMER_ABSTIME) + alarm_start(&ctx->t.alarm, texp); + else + alarm_start_relative(&ctx->t.alarm, texp); + } else { + hrtimer_start(&ctx->t.tmr, texp, htmode); + } + if (timerfd_canceled(ctx)) return -ECANCELED; } @@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file) struct timerfd_ctx *ctx = file->private_data; timerfd_remove_cancel(ctx); - hrtimer_cancel(&ctx->tmr); + + if (isalarm(ctx)) + alarm_cancel(&ctx->t.alarm); + else + hrtimer_cancel(&ctx->t.tmr); kfree_rcu(ctx, rcu); return 0; } @@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, * callback to avoid DoS attacks specifying a very * short timer period. */ - ticks += hrtimer_forward_now(&ctx->tmr, - ctx->tintv) - 1; - hrtimer_restart(&ctx->tmr); + if (isalarm(ctx)) { + ticks += alarm_forward_now( + &ctx->t.alarm, ctx->tintv) - 1; + alarm_restart(&ctx->t.alarm); + } else { + ticks += hrtimer_forward_now(&ctx->t.tmr, + ctx->tintv) - 1; + hrtimer_restart(&ctx->t.tmr); + } } ctx->expired = 0; ctx->ticks = 0; @@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) if ((flags & ~TFD_CREATE_FLAGS) || (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME)) + clockid != CLOCK_REALTIME && + clockid != CLOCK_REALTIME_ALARM && + clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) init_waitqueue_head(&ctx->wqh); ctx->clockid = clockid; - hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + + if (isalarm(ctx)) + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + else + hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, @@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags, */ for (;;) { spin_lock_irq(&ctx->wqh.lock); - if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) - break; + + if (isalarm(ctx)) { + if (alarm_try_to_cancel(&ctx->t.alarm) >= 0) + break; + } else { + if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0) + break; + } spin_unlock_irq(&ctx->wqh.lock); cpu_relax(); } @@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags, * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ - if (ctx->expired && ctx->tintv.tv64) - hrtimer_forward_now(&ctx->tmr, ctx->tintv); + if (ctx->expired && ctx->tintv.tv64) { + if (isalarm(ctx)) + alarm_forward_now(&ctx->t.alarm, ctx->tintv); + else + hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); + } old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); old->it_interval = ktime_to_timespec(ctx->tintv); @@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv.tv64) { ctx->expired = 0; - ctx->ticks += - hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; - hrtimer_restart(&ctx->tmr); + + if (isalarm(ctx)) { + ctx->ticks += + alarm_forward_now( + &ctx->t.alarm, ctx->tintv) - 1; + alarm_restart(&ctx->t.alarm); + } else { + ctx->ticks += + hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) + - 1; + hrtimer_restart(&ctx->t.tmr); + } } t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec(ctx->tintv); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index de08c92f2e23..6b4947f75af7 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type) * This means that UBIFS cannot support NFS which requires full * 'seekdir()'/'telldir()' support. */ -static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err, over = 0; + int err; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); - if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) /* * The directory was seek'ed to a senseless position or there * are no more entries. */ return 0; - /* File positions 0 and 1 correspond to "." and ".." */ - if (file->f_pos == 0) { - ubifs_assert(!file->private_data); - over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); - if (over) - return 0; - file->f_pos = 1; + if (file->f_version == 0) { + /* + * The file was seek'ed, which means that @file->private_data + * is now invalid. This may also be just the first + * 'ubifs_readdir()' invocation, in which case + * @file->private_data is NULL, and the below code is + * basically a no-op. + */ + kfree(file->private_data); + file->private_data = NULL; } - if (file->f_pos == 1) { + /* + * 'generic_file_llseek()' unconditionally sets @file->f_version to + * zero, and we use this for detecting whether the file was seek'ed. + */ + file->f_version = 1; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (ctx->pos < 2) { ubifs_assert(!file->private_data); - over = filldir(dirent, "..", 2, 1, - parent_ino(file->f_path.dentry), DT_DIR); - if (over) + if (!dir_emit_dots(file, ctx)) return 0; /* Find the first entry in TNC and save it */ @@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. - * Find the entry corresponding to @file->f_pos or the - * closest one. + * Find the entry corresponding to @ctx->pos or the closest one. */ - dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); nm.name = NULL; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) ubifs_inode(dir)->creat_sqnum); nm.len = le16_to_cpu(dent->nlen); - over = filldir(dirent, dent->name, nm.len, file->f_pos, + if (!dir_emit(ctx, dent->name, nm.len, le64_to_cpu(dent->inum), - vfs_dent_type(dent->type)); - if (over) + vfs_dent_type(dent->type))) return 0; /* Switch to the next entry */ @@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } kfree(file->private_data); - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; cond_resched(); } @@ -448,18 +454,11 @@ out: kfree(file->private_data); file->private_data = NULL; - file->f_pos = 2; + /* 2 is a special value indicating that there are no more direntries */ + ctx->pos = 2; return 0; } -/* If a directory is seeked, we have to free saved readdir() state */ -static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) -{ - kfree(file->private_data); - file->private_data = NULL; - return generic_file_llseek(file, offset, whence); -} - /* Free saved readdir() state when the directory is closed */ static int ubifs_dir_release(struct inode *dir, struct file *file) { @@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = { }; const struct file_operations ubifs_dir_operations = { - .llseek = ubifs_dir_llseek, + .llseek = generic_file_llseek, .release = ubifs_dir_release, .read = generic_read_dir, - .readdir = ubifs_readdir, + .iterate = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..123c79b7261e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,6 +50,7 @@ */ #include "ubifs.h" +#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/slab.h> @@ -1276,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) return err; } -static void ubifs_invalidatepage(struct page *page, unsigned long offset) +static void ubifs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; ubifs_assert(PagePrivate(page)); - if (offset) + if (offset || length < PAGE_CACHE_SIZE) /* Partial page remains dirty */ return; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f21acf0ef01f..879b9976c12b 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c) ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", c->vi.ubi_num, c->vi.vol_id, c->vi.name, - c->ro_mount ? ", R/O mode" : NULL); + c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", diff --git a/fs/udf/dir.c b/fs/udf/dir.c index b3e93f5e17c3..a012c51caffd 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -35,14 +35,16 @@ #include "udf_i.h" #include "udf_sb.h" -static int do_udf_readdir(struct inode *dir, struct file *filp, - filldir_t filldir, void *dirent) + +static int udf_readdir(struct file *file, struct dir_context *ctx) { + struct inode *dir = file_inode(file); + struct udf_inode_info *iinfo = UDF_I(dir); struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; int block, iblock; - loff_t nf_pos = (filp->f_pos - 1) << 2; + loff_t nf_pos; int flen; unsigned char *fname = NULL; unsigned char *nameptr; @@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, uint32_t elen; sector_t offset; int i, num, ret = 0; - unsigned int dt_type; struct extent_position epos = { NULL, 0, {0, 0} }; - struct udf_inode_info *iinfo; + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos = 1; + } + nf_pos = (ctx->pos - 1) << 2; if (nf_pos >= size) goto out; @@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, nf_pos = udf_ext0_offset(dir); fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); - iinfo = UDF_I(dir); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) @@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, } while (nf_pos < size) { - filp->f_pos = (nf_pos >> 2) + 1; + struct kernel_lb_addr tloc; + + ctx->pos = (nf_pos >> 2) + 1; fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); @@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, } if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { - iblock = parent_ino(filp->f_path.dentry); - flen = 2; - memcpy(fname, "..", flen); - dt_type = DT_DIR; - } else { - struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); - - iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); - dt_type = DT_UNKNOWN; + if (!dir_emit_dotdot(file, ctx)) + goto out; + continue; } - if (flen && filldir(dirent, fname, flen, filp->f_pos, - iblock, dt_type) < 0) + flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + if (!flen) + continue; + + tloc = lelb_to_cpu(cfi.icb.extLocation); + iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); + if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) goto out; } /* end while */ - filp->f_pos = (nf_pos >> 2) + 1; + ctx->pos = (nf_pos >> 2) + 1; out: if (fibh.sbh != fibh.ebh) @@ -184,27 +189,11 @@ out: return ret; } -static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *dir = file_inode(filp); - int result; - - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { - return 0; - } - filp->f_pos++; - } - - result = do_udf_readdir(dir, filp, filldir, dirent); - return result; -} - /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = udf_readdir, + .iterate = udf_readdir, .unlocked_ioctl = udf_ioctl, .fsync = generic_file_fsync, }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> +#include <linux/aio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 102c072c6bbf..5f6fc17d6bc5 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, return 0; } +static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct udf_inode_info *iinfo; + int err; + + inode = udf_new_inode(dir, mode, &err); + if (!inode) + return err; + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + d_tmpfile(dentry, inode); + return 0; +} + static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { @@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = { .rmdir = udf_rmdir, .mknod = udf_mknod, .rename = udf_rename, + .tmpfile = udf_tmpfile, }; const struct inode_operations udf_symlink_inode_operations = { .readlink = generic_readlink, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 3a75ca09c506..0ecc2cebed8f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base, * This is blatantly stolen from ext2fs */ static int -ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) +ufs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = ufs_dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - int need_revalidate = filp->f_version != inode->i_version; + int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; UFSD("BEGIN\n"); @@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return -EIO; } kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (struct ufs_dir_entry *)(kaddr+offset); @@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) return -EIO; } if (de->d_ino) { - int over; unsigned char d_type = DT_UNKNOWN; - offset = (char *)de - kaddr; - UFSD("filldir(%s,%u)\n", de->d_name, fs32_to_cpu(sb, de->d_ino)); UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); @@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) d_type = de->d_u.d_44.d_type; - over = filldir(dirent, de->d_name, + if (!dir_emit(ctx, de->d_name, ufs_get_de_namlen(sb, de), - (n<<PAGE_CACHE_SHIFT) | offset, - fs32_to_cpu(sb, de->d_ino), d_type); - if (over) { + fs32_to_cpu(sb, de->d_ino), + d_type)) { ufs_put_page(page); return 0; } } - filp->f_pos += fs16_to_cpu(sb, de->d_reclen); + ctx->pos += fs16_to_cpu(sb, de->d_reclen); } ufs_put_page(page); } @@ -660,7 +656,7 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, - .readdir = ufs_readdir, + .iterate = ufs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 95425b59ce0a..b6c2f94e041e 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -26,8 +26,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, count = size >> uspi->s_fshift; if (count > UFS_MAXFRAG) return NULL; - ubh = (struct ufs_buffer_head *) - kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); + ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); if (!ubh) return NULL; ubh->fragment = fragment; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index cc33aaf219f1..399e8cec6e60 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -69,6 +69,19 @@ config XFS_RT If unsure, say N. +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + config XFS_DEBUG bool "XFS Debugging support" depends on XFS_FS diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d02201df855b..4a4508023a3c 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -45,11 +45,11 @@ xfs-y += xfs_aops.o \ xfs_itable.o \ xfs_message.o \ xfs_mru_cache.o \ - xfs_super.o \ - xfs_xattr.o \ xfs_rename.o \ + xfs_super.o \ xfs_utils.o \ xfs_vnodeops.o \ + xfs_xattr.o \ kmem.o \ uuid.o @@ -58,6 +58,7 @@ xfs-y += xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ + xfs_attr_remote.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ @@ -70,9 +71,11 @@ xfs-y += xfs_alloc.o \ xfs_dir2_sf.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ + xfs_icreate_item.o \ xfs_inode.o \ xfs_log_recover.o \ xfs_mount.o \ + xfs_symlink.o \ xfs_trans.o # low-level transaction/log code diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h index ff6a19873e5c..e3c92d19e540 100644 --- a/fs/xfs/mrlock.h +++ b/fs/xfs/mrlock.h @@ -22,12 +22,12 @@ typedef struct { struct rw_semaphore mr_lock; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int mr_writer; #endif } mrlock_t; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #else @@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass) static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { down_write_nested(&mrp->mr_lock, subclass); -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif } @@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif return 1; @@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp) static inline void mrunlock_excl(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif up_write(&mrp->mr_lock); @@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp) static inline void mrdemote(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif downgrade_write(&mrp->mr_lock); diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index d8b11b7f94aa..a742c47f7d5a 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,6 +24,11 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + #include "xfs_linux.h" #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 1d32f1d52763..306d883d89bc 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -21,6 +21,8 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_vnodeops.h" +#include "xfs_sb.h" +#include "xfs_mount.h" #include "xfs_trace.h" #include <linux/slab.h> #include <linux/xattr.h> @@ -34,7 +36,9 @@ */ STATIC struct posix_acl * -xfs_acl_from_disk(struct xfs_acl *aclp) +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; @@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); - if (count > XFS_ACL_MAX_ENTRIES) + if (count > max_entries) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type) struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl; struct xfs_acl *xfs_acl; - int len = sizeof(struct xfs_acl); unsigned char *ea_name; int error; + int len; acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) @@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type) * If we have a cached ACLs value just return it, not need to * go out to the disk. */ - - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type) goto out; } - acl = xfs_acl_from_disk(xfs_acl); + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; @@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (acl) { struct xfs_acl *xfs_acl; - int len; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return -ENOMEM; xfs_acl_to_disk(xfs_acl, acl); - len = sizeof(struct xfs_acl) - - (sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); @@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) static int xfs_acl_exists(struct inode *inode, unsigned char *name) { - int len = sizeof(struct xfs_acl); + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); return (xfs_attr_get(XFS_I(inode), name, NULL, &len, ATTR_ROOT|ATTR_KERNOVAL) == 0); @@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; error = -EINVAL; - if (acl->a_count > XFS_ACL_MAX_ENTRIES) + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) goto out_release; if (type == ACL_TYPE_ACCESS) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 39632d941354..4016a567b83c 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,19 +22,36 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) /* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - } acl_entry[XFS_ACL_MAX_ENTRIES]; + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; }; +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" #define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f2aeedb6a579..317aa86d96ea 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -30,6 +30,7 @@ struct xfs_trans; #define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ #define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ #define XFS_AGF_VERSION 1 #define XFS_AGI_VERSION 1 @@ -63,12 +64,29 @@ typedef struct xfs_agf { __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ } xfs_agf_t; #define XFS_AGF_MAGICNUM 0x00000001 @@ -83,7 +101,8 @@ typedef struct xfs_agf { #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 #define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_NUM_BITS 12 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -98,7 +117,8 @@ typedef struct xfs_agf { { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ - { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) @@ -132,6 +152,7 @@ typedef struct xfs_agi { __be32 agi_root; /* root of inode btree */ __be32 agi_level; /* levels in inode btree */ __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ __be32 agi_dirino; /* last directory inode chunk */ /* @@ -139,6 +160,13 @@ typedef struct xfs_agi { * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; #define XFS_AGI_MAGICNUM 0x00000001 @@ -171,11 +199,31 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops; */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. + */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + typedef struct xfs_agfl { - __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0ad23253e8b1..71596e57283a 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -33,7 +33,9 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" struct workqueue_struct *xfs_alloc_wq; @@ -173,6 +175,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ + char userdata, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -187,7 +190,14 @@ xfs_alloc_compute_diff( ASSERT(freelen >= wantlen); freeend = freebno + freelen; wantend = wantbno + wantlen; - if (freebno >= wantbno) { + /* + * We want to allocate from the start of a free extent if it is past + * the desired block or if we are allocating user data and the free + * extent is before desired block. The second case is there to allow + * for contiguous allocation from the remaining free space if the file + * grows in the short term. + */ + if (freebno >= wantbno || (userdata && freeend < wantend)) { if ((newbno1 = roundup(freebno, alignment)) >= freeend) newbno1 = NULLAGBLOCK; } else if (freeend >= wantend && alignment > 1) { @@ -430,53 +440,84 @@ xfs_alloc_fixup_trees( return 0; } -static void +static bool xfs_agfl_verify( struct xfs_buf *bp) { -#ifdef WHEN_CRCS_COME_ALONG - /* - * we cannot actually do any verification of the AGFL because mkfs does - * not initialise the AGFL to zero or NULL. Hence the only valid part of - * the AGFL is what the AGF says is active. We can't get to the AGF, so - * we can't verify just those entries are valid. - * - * This problem goes away when the CRC format change comes along as that - * requires the AGFL to be initialised by mkfs. At that point, we can - * verify the blocks in the agfl -active or not- lie within the bounds - * of the AG. Until then, just leave this check ifdef'd out. - */ struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); - int agfl_ok = 1; - int i; + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + return false; + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) - agfl_ok = 0; + return false; } + return true; +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int agfl_ok = 1; + + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agfl, agfl_crc)); + + agfl_ok = agfl_ok && xfs_agfl_verify(bp); if (!agfl_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); } -#endif } static void xfs_agfl_write_verify( struct xfs_buf *bp) { - xfs_agfl_verify(bp); -} + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; -static void -xfs_agfl_read_verify( - struct xfs_buf *bp) -{ - xfs_agfl_verify(bp); + /* no verification of non-crc AGFLs */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_agfl_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) + XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agfl, agfl_crc)); } const struct xfs_buf_ops xfs_agfl_buf_ops = { @@ -772,7 +813,8 @@ xfs_alloc_find_best_extent( xfs_alloc_fix_len(args); sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, *sbnoa, + args->alignment, + args->userdata, *sbnoa, *slena, &new); /* @@ -842,7 +884,7 @@ xfs_alloc_ag_vextent_near( */ int dofirst; /* set to do first algorithm */ - dofirst = random32() & 1; + dofirst = prandom_u32() & 1; #endif restart: @@ -943,7 +985,8 @@ restart: if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbnoa, ltlena, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { bdiff = ltdiff; @@ -1095,7 +1138,8 @@ restart: args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbnoa, ltlena, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); error = xfs_alloc_find_best_extent(args, &bno_cur_lt, &bno_cur_gt, @@ -1111,7 +1155,8 @@ restart: args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, gtbnoa, gtlena, >new); + args->alignment, args->userdata, gtbnoa, + gtlena, >new); error = xfs_alloc_find_best_extent(args, &bno_cur_gt, &bno_cur_lt, @@ -1170,7 +1215,7 @@ restart: } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - ltbnoa, ltlena, <new); + args->userdata, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); @@ -1982,18 +2027,18 @@ xfs_alloc_get_freelist( int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; int error; int logflags; - xfs_mount_t *mp; /* mount structure */ + xfs_mount_t *mp = tp->t_mountp; xfs_perag_t *pag; /* per allocation group data */ - agf = XFS_BUF_TO_AGF(agbp); /* * Freelist is empty, give up. */ + agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2001,15 +2046,17 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - mp = tp->t_mountp; - if ((error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); + + /* * Get the block number and update the data structures. */ - bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) @@ -2058,11 +2105,14 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), sizeof(xfs_agf_t) }; trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } @@ -2099,12 +2149,13 @@ xfs_alloc_put_freelist( int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. free block array */ __be32 *blockp;/* pointer to array entry */ int error; int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; agf = XFS_BUF_TO_AGF(agbp); mp = tp->t_mountp; @@ -2112,7 +2163,6 @@ xfs_alloc_put_freelist( if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; @@ -2133,32 +2183,38 @@ xfs_alloc_put_freelist( xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); - blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + xfs_alloc_log_agf(tp, agbp, logflags); - xfs_trans_log_buf(tp, agflbp, - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + - sizeof(xfs_agblock_t) - 1)); + + xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); return 0; } -static void +static bool xfs_agf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_agf *agf; - int agf_ok; + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); - agf = XFS_BUF_TO_AGF(bp); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; - agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -2166,33 +2222,58 @@ xfs_agf_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true;; - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } } static void xfs_agf_read_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + int agf_ok = 1; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agf, agf_crc)); + + agf_ok = agf_ok && xfs_agf_verify(mp, bp); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_agf_write_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agf, agf_crc)); } const struct xfs_buf_ops xfs_agf_buf_ops = { diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b1ddef6b2689..cafc90251d19 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC struct xfs_btree_cur * @@ -272,7 +273,7 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -static void +static bool xfs_allocbt_verify( struct xfs_buf *bp) { @@ -280,66 +281,103 @@ xfs_allocbt_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ /* * magic number and level verification * - * During growfs operations, we can't verify the exact level as the - * perag is not fully initialised and hence not attached to the buffer. - * In this case, check against the maximum tree depth. + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. */ level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; default: - sblock_ok = 0; - break; + return false; } /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_allocbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_allocbt_buf_ops = { @@ -348,7 +386,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_allocbt_keys_inorder( struct xfs_btree_cur *cur, @@ -404,7 +442,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, #endif @@ -444,6 +482,9 @@ xfs_allocbt_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + return cur; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 7e89a2b429dd..e3a3f7424192 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -31,8 +31,10 @@ struct xfs_mount; * by blockcount and blockno. All blocks look the same to make the code * simpler; if we have time later, we'll make the optimizations. */ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ /* * Data record/key structure @@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f707e537171..596ec71da00e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -724,6 +725,25 @@ xfs_convert_page( (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, i_size_read(inode)); + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + len = 1 << inode->i_blkbits; p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), PAGE_CACHE_SIZE); @@ -823,10 +843,12 @@ xfs_cluster_write( STATIC void xfs_vm_invalidatepage( struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); + trace_xfs_invalidatepage(page->mapping->host, page, offset, + length); + block_invalidatepage(page, offset, length); } /* @@ -890,7 +912,7 @@ next_buffer: xfs_iunlock(ip, XFS_ILOCK_EXCL); out_invalidate: - xfs_vm_invalidatepage(page, 0); + xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); return; } @@ -920,7 +942,7 @@ xfs_vm_writepage( int count = 0; int nonblocking = 0; - trace_xfs_writepage(inode, page, 0); + trace_xfs_writepage(inode, page, 0, 0); ASSERT(page_has_buffers(page)); @@ -953,13 +975,13 @@ xfs_vm_writepage( unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* - * Just skip the page if it is fully outside i_size, e.g. due - * to a truncate operation that is in progress. + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * xfs_vm_releasepage() is called on it and gets confused. */ - if (page->index >= end_index + 1 || offset_into_page == 0) { - unlock_page(page); - return 0; - } + if (page->index >= end_index + 1 || offset_into_page == 0) + goto redirty; /* * The page straddles i_size. It must be zeroed out on each @@ -1151,7 +1173,7 @@ xfs_vm_releasepage( { int delalloc, unwritten; - trace_xfs_releasepage(page->mapping->host, page, 0); + trace_xfs_releasepage(page->mapping->host, page, 0, 0); xfs_count_page_state(page, &delalloc, &unwritten); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 888683844d98..20fe3fe9d341 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,7 +15,6 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -35,6 +34,7 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" @@ -74,13 +74,6 @@ STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -/* - * Routines to manipulate out-of-line attribute values. - */ -STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); -STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); - -#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ STATIC int xfs_attr_name_to_xname( @@ -820,7 +813,7 @@ xfs_attr_inactive(xfs_inode_t *dp) error = 0; goto out; } - error = xfs_attr_root_inactive(&trans, dp); + error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out; @@ -906,7 +899,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; @@ -914,14 +907,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Look up the given attribute in the leaf block. Figure out if * the given flags produce an error or call for an atomic rename. */ - retval = xfs_attr_leaf_lookup_int(bp, args); + retval = xfs_attr3_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } trace_xfs_attr_leaf_replace(args); @@ -937,7 +930,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Add the attribute to the leaf block, transitioning to a Btree * if required. */ - retval = xfs_attr_leaf_add(bp, args); + retval = xfs_attr3_leaf_add(bp, args); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -945,7 +938,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * can manage its own transactions. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1010,7 +1003,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) return(error); @@ -1032,19 +1025,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1076,9 +1069,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); } - return(error); + return error; } /* @@ -1101,24 +1094,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1128,7 +1121,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -1138,7 +1131,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } - return(0); + return 0; } /* @@ -1156,21 +1149,21 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error != EEXIST) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - error = xfs_attr_leaf_getvalue(bp, args); + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } - return(error); + return error; } /* @@ -1185,11 +1178,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - error = xfs_attr_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); return XFS_ERROR(error); } @@ -1236,7 +1229,7 @@ restart: * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; blk = &state->path.blk[ state->path.active-1 ]; @@ -1258,7 +1251,7 @@ restart: args->rmtblkcnt = 0; } - retval = xfs_attr_leaf_add(blk->bp, state->args); + retval = xfs_attr3_leaf_add(blk->bp, state->args); if (retval == ENOSPC) { if (state->path.active == 1) { /* @@ -1268,7 +1261,7 @@ restart: */ xfs_da_state_free(state); xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1307,7 +1300,7 @@ restart: * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_split(state); + error = xfs_da3_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1329,7 +1322,7 @@ restart: /* * Addition succeeded, update Btree hashvals. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } /* @@ -1370,7 +1363,7 @@ restart: * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) goto out; @@ -1400,7 +1393,7 @@ restart: state->blocksize = state->mp->m_sb.sb_blocksize; state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1409,15 +1402,15 @@ restart: */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1450,7 +1443,7 @@ restart: /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); if (error) goto out; } @@ -1495,7 +1488,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error || (retval != EEXIST)) { if (error == 0) error = retval; @@ -1524,7 +1517,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Mark the attribute as INCOMPLETE, then bunmapi() the * remote value. */ - error = xfs_attr_leaf_setflag(args); + error = xfs_attr3_leaf_setflag(args); if (error) goto out; error = xfs_attr_rmtval_remove(args); @@ -1545,15 +1538,15 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1591,13 +1584,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1699,7 +1692,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1718,7 +1711,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1758,7 +1751,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; } else if (retval == EEXIST) { @@ -1769,7 +1762,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Get the value, local or "remote" */ - retval = xfs_attr_leaf_getvalue(blk->bp, args); + retval = xfs_attr3_leaf_getvalue(blk->bp, args); if (!retval && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { retval = xfs_attr_rmtval_get(args); @@ -1794,7 +1787,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; int error, i; struct xfs_buf *bp; @@ -1810,27 +1805,33 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { + struct xfs_attr_leaf_entry *entries; + node = bp->b_addr; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - if (cursor->hashval > be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)) { + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; - } else if (cursor->hashval <= - be32_to_cpu(leaf->entries[0].hashval)) { + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; @@ -1852,27 +1853,31 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_node_read(NULL, context->dp, + __uint16_t magic; + + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) return(error); node = bp->b_addr; - if (node->hdr.info.magic == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) break; - if (unlikely(node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC))) { + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", XFS_ERRLEVEL_LOW, context->dp->i_mount, node); xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); + return XFS_ERROR(EFSCORRUPTED); } - btree = node->btree; - for (i = 0; i < be16_to_cpu(node->hdr.count); - btree++, i++) { + + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { if (cursor->hashval <= be32_to_cpu(btree->hashval)) { cursor->blkno = be32_to_cpu(btree->before); @@ -1881,9 +1886,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; } } - if (i == be16_to_cpu(node->hdr.count)) { + if (i == nodehdr.count) { xfs_trans_brelse(NULL, bp); - return(0); + return 0; } xfs_trans_brelse(NULL, bp); } @@ -1897,310 +1902,21 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); return error; } - if (context->seen_enough || leaf->hdr.info.forw == 0) + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) break; - cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); + cursor->blkno = leafhdr.forw; xfs_trans_brelse(NULL, bp); - error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1, &bp); if (error) return error; } xfs_trans_brelse(NULL, bp); - return(0); -} - - -/*======================================================================== - * External routines for manipulating out-of-line attribute values. - *========================================================================*/ - -/* - * Read the value associated with an attribute from the out-of-line buffer - * that we stored it in. - */ -int -xfs_attr_rmtval_get(xfs_da_args_t *args) -{ - xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; - xfs_mount_t *mp; - xfs_daddr_t dblkno; - void *dst; - xfs_buf_t *bp; - int nmap, error, tmp, valuelen, blkcnt, i; - xfs_dablk_t lblkno; - - trace_xfs_attr_rmtval_get(args); - - ASSERT(!(args->flags & ATTR_KERNOVAL)); - - mp = args->dp->i_mount; - dst = args->value; - valuelen = args->valuelen; - lblkno = args->rmtblkno; - while (valuelen > 0) { - nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap >= 1); - - for (i = 0; (i < nmap) && (valuelen > 0); i++) { - ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && - (map[i].br_startblock != HOLESTARTBLOCK)); - dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, NULL); - if (error) - return(error); - - tmp = min_t(int, valuelen, BBTOB(bp->b_length)); - xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); - xfs_buf_relse(bp); - dst += tmp; - valuelen -= tmp; - - lblkno += map[i].br_blockcount; - } - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. - */ -STATIC int -xfs_attr_rmtval_set(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_fileoff_t lfileoff; - xfs_inode_t *dp; - xfs_bmbt_irec_t map; - xfs_daddr_t dblkno; - void *src; - xfs_buf_t *bp; - xfs_dablk_t lblkno; - int blkcnt, valuelen, nmap, error, tmp, committed; - - trace_xfs_attr_rmtval_set(args); - - dp = args->dp; - mp = dp->i_mount; - src = args->value; - - /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. - */ - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - lfileoff = 0; - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, - XFS_ATTR_FORK); - if (error) { - return(error); - } - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; - args->rmtblkcnt = blkcnt; - - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll(&args->trans, dp); - if (error) - return (error); - } - - /* - * Roll through the "value", copying the attribute value to the - * already-allocated blocks. Blocks are written synchronously - * so that we can know they are all on disk before we turn off - * the INCOMPLETE flag. - */ - lblkno = args->rmtblkno; - valuelen = args->valuelen; - while (valuelen > 0) { - int buflen; - - /* - * Try to remember where we decided to put the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); - if (!bp) - return ENOMEM; - - buflen = BBTOB(bp->b_length); - tmp = min_t(int, valuelen, buflen); - xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < buflen) - xfs_buf_zero(bp, tmp, buflen - tmp); - - error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ - xfs_buf_relse(bp); - if (error) - return error; - src += tmp; - valuelen -= tmp; - - lblkno += map.br_blockcount; - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -STATIC int -xfs_attr_rmtval_remove(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; - - trace_xfs_attr_rmtval_remove(args); - - mp = args->dp->i_mount; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { - /* - * Try to remember where we decided to put the value. - */ - nmap = 1; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } - - valuelen -= map.br_blockcount; - - lblkno += map.br_blockcount; - } - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - xfs_bmap_init(args->flist, args->firstblock); - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll(&args->trans, args->dp); - if (error) - return (error); - } - return(0); + return 0; } diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e920d68ef509..de8dd58da46c 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -140,7 +140,6 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. */ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ee24993c7d12..31d3cd129269 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -31,6 +32,7 @@ #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -39,6 +41,9 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + /* * xfs_attr_leaf.c @@ -53,85 +58,226 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - struct xfs_buf **bpp); -STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, - xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *leaf_buffer); -STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *leaf_blk_1, - xfs_da_state_blk_t *leaf_blk_2, - int *number_entries_in_blk1, - int *number_usedbytes_in_blk1); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, struct xfs_buf *bp, int level); -STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, struct xfs_buf *bp); -STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt); /* * Utility routines. */ -STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, - int src_start, - xfs_attr_leafblock_t *dst_leaf, - int dst_start, int move_count, - xfs_mount_t *mp); +STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count, struct xfs_mount *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void -xfs_attr_leaf_verify( +void +xfs_attr3_leaf_hdr_from_disk( + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + int i; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->usedbytes = be16_to_cpu(hdr3->usedbytes); + to->firstused = be16_to_cpu(hdr3->firstused); + to->holes = hdr3->holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); + to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); + } + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->usedbytes = be16_to_cpu(from->hdr.usedbytes); + to->firstused = be16_to_cpu(from->hdr.firstused); + to->holes = from->hdr.holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); + to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); + } +} + +void +xfs_attr3_leaf_hdr_to_disk( + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + int i; + + ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || + from->magic == XFS_ATTR3_LEAF_MAGIC); + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->usedbytes = cpu_to_be16(from->usedbytes); + hdr3->firstused = cpu_to_be16(from->firstused); + hdr3->holes = from->holes; + hdr3->pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); + hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); + } + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.usedbytes = cpu_to_be16(from->usedbytes); + to->hdr.firstused = cpu_to_be16(from->firstused); + to->hdr.holes = from->holes; + to->hdr.pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); + to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); + } +} + +static bool +xfs_attr3_leaf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr3_icleaf_hdr ichdr; - block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + return false; } + if (ichdr.count == 0) + return false; + + /* XXX: need to range check rest of attr header values */ + /* XXX: hash order check? */ + + return true; } static void -xfs_attr_leaf_read_verify( +xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_attr3_leaf_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ static void -xfs_attr_leaf_write_verify( - struct xfs_buf *bp) +xfs_attr3_leaf_read_verify( + struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_ATTR3_LEAF_CRC_OFF)) || + !xfs_attr3_leaf_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } -const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { - .verify_read = xfs_attr_leaf_read_verify, - .verify_write = xfs_attr_leaf_write_verify, +const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .verify_read = xfs_attr3_leaf_read_verify, + .verify_write = xfs_attr3_leaf_write_verify, }; int -xfs_attr_leaf_read( +xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; } /*======================================================================== @@ -172,7 +318,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int dsize; xfs_mount_t *mp = dp->i_mount; - offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + /* rounded down */ + offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; switch (dp->i_d.di_format) { case XFS_DINODE_FMT_DEV: @@ -231,7 +378,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return 0; return dp->i_d.di_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; } @@ -243,7 +390,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -557,7 +705,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) } ASSERT(blkno == 0); - error = xfs_attr_leaf_create(args, blkno, &bp); + error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) { error = xfs_da_shrink_inode(args, 0, bp); bp = NULL; @@ -586,9 +734,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); - error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); - error = xfs_attr_leaf_add(bp, &nargs); + error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != ENOSPC); if (error) goto out; @@ -783,67 +931,74 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) */ int xfs_attr_shortform_allfit( - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) return(0); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform( - struct xfs_buf *bp, - xfs_da_args_t *args, - int forkoff) +xfs_attr3_leaf_to_shortform( + struct xfs_buf *bp, + struct xfs_da_args *args, + int forkoff) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_da_args_t nargs; - xfs_inode_t *dp; - char *tmpbuffer; - int error, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_da_args nargs; + struct xfs_inode *dp = args->dp; + char *tmpbuffer; + int error; + int i; trace_xfs_attr_leaf_to_sf(args); - dp = args->dp; tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); - ASSERT(tmpbuffer != NULL); + if (!tmpbuffer) + return ENOMEM; - ASSERT(bp != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + /* XXX (dgc): buffer is about to be marked stale - why zero it? */ memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); /* @@ -873,14 +1028,14 @@ xfs_attr_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + + for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; nargs.value = &name_loc->nameval[nargs.namelen]; @@ -893,61 +1048,75 @@ xfs_attr_leaf_to_shortform( out: kmem_free(tmpbuffer); - return(error); + return error; } /* * Convert from using a single leaf to a root node and a leaf. */ int -xfs_attr_leaf_to_node(xfs_da_args_t *args) +xfs_attr3_leaf_to_node( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_inode_t *dp; - struct xfs_buf *bp1, *bp2; - xfs_dablk_t blkno; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr icleafhdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr icnodehdr; + struct xfs_da_intnode *node; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp1 = NULL; + struct xfs_buf *bp2 = NULL; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_to_node(args); - dp = args->dp; - bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); if (error) goto out; - bp2 = NULL; - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, - XFS_ATTR_FORK); + error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; + + /* copy leaf to new buffer, update identifiers */ + xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); - bp1 = NULL; - xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; + hdr3->blkno = cpu_to_be64(bp2->b_bn); + } + xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1); /* * Set up the new root node. */ - error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; node = bp1->b_addr; + xfs_da3_node_hdr_from_disk(&icnodehdr, node); + btree = xfs_da3_node_tree_p(node); + leaf = bp2->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + /* both on-disk, don't endian-flip twice */ - node->btree[0].hashval = - leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; - node->btree[0].before = cpu_to_be32(blkno); - node->hdr.count = cpu_to_be16(1); - xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + btree[0].hashval = entries[icleafhdr.count - 1].hashval; + btree[0].before = cpu_to_be32(blkno); + icnodehdr.count = 1; + xfs_da3_node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1); error = 0; out: - return(error); + return error; } @@ -960,52 +1129,63 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create( - xfs_da_args_t *args, - xfs_dablk_t blkno, - struct xfs_buf **bpp) +xfs_attr3_leaf_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_create(args); - dp = args->dp; - ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - bp->b_ops = &xfs_attr_leaf_buf_ops; + return error; + bp->b_ops = &xfs_attr3_leaf_buf_ops; + xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; - memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); - hdr = &leaf->hdr; - hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); - } + memset(leaf, 0, XFS_LBSIZE(mp)); + + memset(&ichdr, 0, sizeof(ichdr)); + ichdr.firstused = XFS_LBSIZE(mp); - hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; - xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + ichdr.magic = XFS_ATTR3_LEAF_MAGIC; + + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); + } else { + ichdr.magic = XFS_ATTR_LEAF_MAGIC; + ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); + } + ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; + + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1); *bpp = bp; - return(0); + return 0; } /* * Split the leaf node, rebalance, then add the new entry. */ int -xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_attr3_leaf_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { xfs_dablk_t blkno; int error; @@ -1019,7 +1199,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, error = xfs_da_grow_inode(state->args, &blkno); if (error) return(error); - error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return(error); newblk->blkno = blkno; @@ -1029,8 +1209,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. */ - xfs_attr_leaf_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_attr3_leaf_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); @@ -1043,10 +1223,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr_leaf_add(oldblk->bp, state->args); + error = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr_leaf_add(newblk->bp, state->args); + error = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1061,22 +1241,23 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add( +xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - int tablesize, entsize, sum, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; + int sum; + int tmp; + int i; trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT((args->index >= 0) - && (args->index <= be16_to_cpu(leaf->hdr.count))); - hdr = &leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, args->trans->t_mountp->m_sb.sb_blocksize, NULL); @@ -1084,25 +1265,23 @@ xfs_attr_leaf_add( * Search through freemap for first-fit on new name length. * (may need to figure in size of entry struct too) */ - tablesize = (be16_to_cpu(hdr->count) + 1) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; - for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { - if (tablesize > be16_to_cpu(hdr->firstused)) { - sum += be16_to_cpu(map->size); + tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { + if (tablesize > ichdr.firstused) { + sum += ichdr.freemap[i].size; continue; } - if (!map->size) + if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; - if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); - if (be16_to_cpu(map->size) >= tmp) { - tmp = xfs_attr_leaf_add_work(bp, args, i); - return(tmp); + if (ichdr.freemap[i].size >= tmp) { + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; } - sum += be16_to_cpu(map->size); + sum += ichdr.freemap[i].size; } /* @@ -1110,82 +1289,89 @@ xfs_attr_leaf_add( * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ - if (!hdr->holes && (sum < entsize)) - return(XFS_ERROR(ENOSPC)); + if (!ichdr.holes && sum < entsize) + return XFS_ERROR(ENOSPC); /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args, bp); + xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ - if (be16_to_cpu(hdr->freemap[0].size) - < (entsize + sizeof(xfs_attr_leaf_entry_t))) - return(XFS_ERROR(ENOSPC)); + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { + tmp = ENOSPC; + goto out_log_hdr; + } + + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); - return(xfs_attr_leaf_add_work(bp, args, 0)); +out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + return tmp; } /* * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work( - struct xfs_buf *bp, - xfs_da_args_t *args, - int mapindex) +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_leaf_map_t *map; - xfs_mount_t *mp; - int tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; trace_xfs_attr_leaf_add_work(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); - ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); /* * Force open some space in the entry array and fill it in. */ - entry = &leaf->entries[args->index]; - if (args->index < be16_to_cpu(hdr->count)) { - tmp = be16_to_cpu(hdr->count) - args->index; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); - memmove((char *)(entry+1), (char *)entry, tmp); + memmove(entry + 1, entry, tmp); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } - be16_add_cpu(&hdr->count, 1); + ichdr->count++; /* * Allocate space for the new string (at the end of the run). */ - map = &hdr->freemap[mapindex]; mp = args->trans->t_mountp; - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->base) & 0x3) == 0); - ASSERT(be16_to_cpu(map->size) >= + ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp)); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= xfs_attr_leaf_newentsize(args->namelen, args->valuelen, mp->m_sb.sb_blocksize, NULL)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->size) & 0x3) == 0); - be16_add_cpu(&map->size, - -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp)); - entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + - be16_to_cpu(map->size)); + ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp)); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= + xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); @@ -1200,7 +1386,7 @@ xfs_attr_leaf_add_work( XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); - ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + ASSERT((args->index == ichdr->count - 1) || (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* @@ -1211,14 +1397,14 @@ xfs_attr_leaf_add_work( * as part of this transaction (a split operation for example). */ if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); name_loc->namelen = args->namelen; name_loc->valuelen = cpu_to_be16(args->valuelen); memcpy((char *)name_loc->nameval, args->name, args->namelen); memcpy((char *)&name_loc->nameval[args->namelen], args->value, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->namelen = args->namelen; memcpy((char *)name_rmt->name, args->name, args->namelen); entry->flags |= XFS_ATTR_INCOMPLETE; @@ -1226,47 +1412,45 @@ xfs_attr_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); /* * Update the control info for this leaf node */ - if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { - /* both on-disk, don't endian-flip twice */ - hdr->firstused = entry->nameidx; - } - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - if (be16_to_cpu(map->base) == tmp) { - be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); - be16_add_cpu(&map->size, - -((int)sizeof(xfs_attr_leaf_entry_t))); + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); } } - be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); - return(0); + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; } /* * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact( +xfs_attr3_leaf_compact( struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1274,43 +1458,87 @@ xfs_attr_leaf_compact( trace_xfs_attr_leaf_compact(args); tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - hdr_d->info = hdr_s->info; /* struct copy */ - hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); - /* handle truncation gracefully */ - if (!hdr_d->firstused) { - hdr_d->firstused = cpu_to_be16( - XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); - } - hdr_d->usedbytes = 0; - hdr_d->count = 0; - hdr_d->holes = 0; - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, - be16_to_cpu(hdr_s->count), mp); + xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, + ichdr_src.count, mp); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); kmem_free(tmpbuffer); } /* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); +} + +/* * Redistribute the attribute list entries between two leaf nodes, * taking into account the size of the new entry. * @@ -1323,14 +1551,23 @@ xfs_attr_leaf_compact( * the "new" and "old" values can end up in different blocks. */ STATIC void -xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_args_t *args; - xfs_da_state_blk_t *tmp_blk; - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - int count, totallen, max, space, swap; + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; /* * Set up environment. @@ -1339,9 +1576,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.count == 0); + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(ichdr2.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1353,16 +1590,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * second block, this code should never set "swap". */ swap = 0; - if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; swap = 1; } - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; /* * Examine entries until we reduce the absolute difference in @@ -1372,41 +1616,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * "inleaf" is true if the new entry should be inserted into blk1. * If "swap" is also true, then reverse the sense of "inleaf". */ - state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, - &count, &totallen); + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); if (swap) state->inleaf = !state->inleaf; /* * Move any entries required from leaf to leaf: */ - if (count < be16_to_cpu(hdr1->count)) { + if (count < ichdr1.count) { /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count = be16_to_cpu(hdr1->count) - count; - space = be16_to_cpu(hdr1->usedbytes) - totallen; + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf2 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr2->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk2->bp); + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. */ - xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, - leaf2, 0, count, state->mp); + xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count, + leaf2, &ichdr2, 0, count, state->mp); - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); - } else if (count > be16_to_cpu(hdr1->count)) { + } else if (count > ichdr1.count) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. @@ -1417,36 +1659,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count -= be16_to_cpu(hdr1->count); - space = totallen - be16_to_cpu(hdr1->usedbytes); + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf1 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr1->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk1->bp); + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. */ - xfs_attr_leaf_moveents(leaf2, 0, leaf1, - be16_to_cpu(hdr1->count), count, state->mp); - - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count, state->mp); } + xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + /* * Copy out last hashval in each block for B-tree code. */ - blk1->hashval = be32_to_cpu( - leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu( - leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); /* * Adjust the expected index for insertion. @@ -1460,12 +1703,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * inserting. The index/blkno fields refer to the "old" entry, * while the index2/blkno2 fields refer to the "new" entry. */ - if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + if (blk1->index > ichdr1.count) { ASSERT(state->inleaf == 0); - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = args->index2 = blk2->index; args->blkno = args->blkno2 = blk2->blkno; - } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + } else if (blk1->index == ichdr1.count) { if (state->inleaf) { args->index = blk1->index; args->blkno = blk1->blkno; @@ -1477,8 +1720,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * is already stored in blkno2/index2, so don't * overwrite it overwise we corrupt the tree. */ - blk2->index = blk1->index - - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = blk2->index; args->blkno = blk2->blkno; if (!state->extravalid) { @@ -1506,42 +1748,40 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * GROT: Do a double-split for this case? */ STATIC int -xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2, - int *countarg, int *usedbytesarg) +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - xfs_attr_leaf_entry_t *entry; - int count, max, index, totallen, half; - int lastdelta, foundit, tmp; - - /* - * Set up environment. - */ - leaf1 = blk1->bp->b_addr; - leaf2 = blk2->bp->b_addr; - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; - foundit = 0; - totallen = 0; + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. */ - max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); - half = (max+1) * sizeof(*entry); - half += be16_to_cpu(hdr1->usedbytes) + - be16_to_cpu(hdr2->usedbytes) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); half /= 2; lastdelta = state->blocksize; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { #define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) @@ -1564,9 +1804,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Wrap around into the second block if necessary. */ - if (count == be16_to_cpu(hdr1->count)) { + if (count == ichdr1->count) { leaf1 = leaf2; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); index = 0; } @@ -1597,7 +1837,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, *countarg = count; *usedbytesarg = totallen; - return(foundit); + return foundit; } /*======================================================================== @@ -1616,14 +1856,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * GROT: allow for INCOMPLETE entries in calculation. */ int -xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +xfs_attr3_leaf_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_attr_leafblock_t *leaf; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, bytes, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_da_state_blk *blk; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_buf *bp; + xfs_dablk_t blkno; + int bytes; + int forward; + int error; + int retval; + int i; trace_xfs_attr_leaf_toosmall(state->args); @@ -1633,13 +1879,11 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = sizeof(xfs_attr_leaf_hdr_t) + - count * sizeof(xfs_attr_leaf_entry_t) + - be16_to_cpu(leaf->hdr.usedbytes); + leaf = blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + bytes = xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + ichdr.usedbytes; if (bytes > (state->blocksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); @@ -1651,14 +1895,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (ichdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (ichdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -1667,7 +1911,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) } else { *action = 2; } - return(0); + return 0; } /* @@ -1678,28 +1922,28 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to shrink an attribute list over time. */ /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = ichdr.forw < ichdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_attr3_icleaf_hdr ichdr2; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = ichdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = ichdr.back; if (blkno == 0) continue; - error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return(error); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = state->blocksize - (state->blocksize>>2); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->b_addr; - count += be16_to_cpu(leaf->hdr.count); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - bytes -= count * sizeof(xfs_attr_leaf_entry_t); - bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + + bytes = state->blocksize - (state->blocksize >> 2) - + ichdr.usedbytes - ichdr2.usedbytes - + ((ichdr.count + ichdr2.count) * + sizeof(xfs_attr_leaf_entry_t)) - + xfs_attr3_leaf_hdr_size(leaf); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ @@ -1715,10 +1959,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); } if (error) @@ -1738,32 +1982,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_remove( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - xfs_attr_leaf_entry_t *entry; - int before, after, smallest, entsize; - int tablesize, tmp, i; - xfs_mount_t *mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_mount *mp = args->trans->t_mountp; + int before; + int after; + int smallest; + int entsize; + int tablesize; + int tmp; + int i; trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - mp = args->trans->t_mountp; - ASSERT((be16_to_cpu(hdr->count) > 0) - && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); - ASSERT((args->index >= 0) - && (args->index < be16_to_cpu(hdr->count))); - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - entry = &leaf->entries[args->index]; - ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8); + ASSERT(args->index >= 0 && args->index < ichdr.count); + ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + + xfs_attr3_leaf_hdr_size(leaf)); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); /* @@ -1772,30 +2019,28 @@ xfs_attr_leaf_remove( * find smallest free region in case we need to replace it, * adjust any map that borders the entry table, */ - tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - tmp = be16_to_cpu(map->size); + tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + tmp = ichdr.freemap[0].size; before = after = -1; smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - if (be16_to_cpu(map->base) == tablesize) { - be16_add_cpu(&map->base, - -((int)sizeof(xfs_attr_leaf_entry_t))); - be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp)); + ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp)); + if (ichdr.freemap[i].base == tablesize) { + ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); + ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); } - if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) - == be16_to_cpu(entry->nameidx)) { + if (ichdr.freemap[i].base + ichdr.freemap[i].size == + be16_to_cpu(entry->nameidx)) { before = i; - } else if (be16_to_cpu(map->base) - == (be16_to_cpu(entry->nameidx) + entsize)) { + } else if (ichdr.freemap[i].base == + (be16_to_cpu(entry->nameidx) + entsize)) { after = i; - } else if (be16_to_cpu(map->size) < tmp) { - tmp = be16_to_cpu(map->size); + } else if (ichdr.freemap[i].size < tmp) { + tmp = ichdr.freemap[i].size; smallest = i; } } @@ -1806,36 +2051,30 @@ xfs_attr_leaf_remove( */ if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); - be16_add_cpu(&map->size, - be16_to_cpu(hdr->freemap[after].size)); - hdr->freemap[after].base = 0; - hdr->freemap[after].size = 0; + ichdr.freemap[before].size += entsize; + ichdr.freemap[before].size += ichdr.freemap[after].size; + ichdr.freemap[after].base = 0; + ichdr.freemap[after].size = 0; } else if (before >= 0) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[before].size += entsize; } else { - map = &hdr->freemap[after]; - /* both on-disk, don't endian flip twice */ - map->base = entry->nameidx; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[after].size += entsize; } } else { /* * Replace smallest region (if it is smaller than free'd entry) */ - map = &hdr->freemap[smallest]; - if (be16_to_cpu(map->size) < entsize) { - map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); - map->size = cpu_to_be16(entsize); + if (ichdr.freemap[smallest].size < entsize) { + ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[smallest].size = entsize; } } /* * Did we remove the first entry? */ - if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + if (be16_to_cpu(entry->nameidx) == ichdr.firstused) smallest = 1; else smallest = 0; @@ -1843,20 +2082,20 @@ xfs_attr_leaf_remove( /* * Compress the remaining entries and zero out the removed stuff. */ - memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); - be16_add_cpu(&hdr->usedbytes, -entsize); + memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); + ichdr.usedbytes -= entsize; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), entsize)); - tmp = (be16_to_cpu(hdr->count) - args->index) - * sizeof(xfs_attr_leaf_entry_t); - memmove((char *)entry, (char *)(entry+1), tmp); - be16_add_cpu(&hdr->count, -1); + tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); + memmove(entry, entry + 1, tmp); + ichdr.count--; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); - entry = &leaf->entries[be16_to_cpu(hdr->count)]; - memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); + + entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; + memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* * If we removed the first entry, re-find the first used byte @@ -1866,130 +2105,140 @@ xfs_attr_leaf_remove( */ if (smallest) { tmp = XFS_LBSIZE(mp); - entry = &leaf->entries[0]; - for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { - ASSERT(be16_to_cpu(entry->nameidx) >= - be16_to_cpu(hdr->firstused)); + entry = xfs_attr3_leaf_entryp(leaf); + for (i = ichdr.count - 1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); } - hdr->firstused = cpu_to_be16(tmp); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - tmp - XFS_ATTR_LEAF_NAME_ALIGN); - } + ichdr.firstused = tmp; + if (!ichdr.firstused) + ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; } else { - hdr->holes = 1; /* mark as needing compaction */ + ichdr.holes = 1; /* mark as needing compaction */ } + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); /* * Check if leaf is less than 50% full, caller may want to * "join" the leaf with a sibling if so. */ - tmp = sizeof(xfs_attr_leaf_hdr_t); - tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); - tmp += be16_to_cpu(leaf->hdr.usedbytes); - return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ + tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t); + + return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */ } /* * Move all the attribute list entries from drop_leaf into save_leaf. */ void -xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_attr3_leaf_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; - xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; - xfs_mount_t *mp; - char *tmpbuffer; + struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; + struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; + struct xfs_attr3_icleaf_hdr drophdr; + struct xfs_attr3_icleaf_hdr savehdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_mount *mp = state->mp; trace_xfs_attr_leaf_unbalance(state->args); - /* - * Set up environment. - */ - mp = state->mp; - ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - drop_hdr = &drop_leaf->hdr; - save_hdr = &save_leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + entry = xfs_attr3_leaf_entryp(drop_leaf); /* * Save last hashval from dying block for later Btree fixup. */ - drop_blk->hashval = be32_to_cpu( - drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); /* * Check if we need a temp buffer, or can we do it in place. * Note that we don't check "leaf" for holes because we will * always be dropping it, toosmall() decided that for us already. */ - if (save_hdr->holes == 0) { + if (savehdr.holes == 0) { /* * dest leaf has no holes, so we add there. May need * to make some room in the entry array. */ - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + save_leaf, &savehdr, 0, + drophdr.count, mp); } else { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, - be16_to_cpu(save_hdr->count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + save_leaf, &savehdr, + savehdr.count, drophdr.count, mp); } } else { /* * Destination has holes, so we make a temporary copy * of the leaf and add them both to that. */ - tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memset(tmpbuffer, 0, state->blocksize); - tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; - tmp_hdr = &tmp_leaf->hdr; - tmp_hdr->info = save_hdr->info; /* struct copy */ - tmp_hdr->count = 0; - tmp_hdr->firstused = cpu_to_be16(state->blocksize); - if (!tmp_hdr->firstused) { - tmp_hdr->firstused = cpu_to_be16( - state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); - } - tmp_hdr->usedbytes = 0; - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(save_hdr->count), mp); + struct xfs_attr_leafblock *tmp_leaf; + struct xfs_attr3_icleaf_hdr tmphdr; + + tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + + memset(&tmphdr, 0, sizeof(tmphdr)); + tmphdr.magic = savehdr.magic; + tmphdr.forw = savehdr.forw; + tmphdr.back = savehdr.back; + tmphdr.firstused = state->blocksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, 0, + drophdr.count, mp); + xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + savehdr.count, mp); } else { - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, - be16_to_cpu(save_hdr->count), mp); - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, 0, + savehdr.count, mp); + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + drophdr.count, mp); } - memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer); + memcpy(save_leaf, tmp_leaf, state->blocksize); + savehdr = tmphdr; /* struct copy */ + kmem_free(tmp_leaf); } + xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->blocksize - 1); /* * Copy out last hashval in each block for B-tree code. */ - save_blk->hashval = be32_to_cpu( - save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); + entry = xfs_attr3_leaf_entryp(save_leaf); + save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); } /*======================================================================== @@ -2010,31 +2259,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_lookup_int( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - int probe, span; - xfs_dahash_t hashval; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + xfs_dahash_t hashval; + int probe; + int span; trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); /* * Binary search. (note: small blocks will skip this loop) */ hashval = args->hashval; - probe = span = be16_to_cpu(leaf->hdr.count) / 2; - for (entry = &leaf->entries[probe]; span > 4; - entry = &leaf->entries[probe]) { + probe = span = ichdr.count / 2; + for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { span /= 2; if (be32_to_cpu(entry->hashval) < hashval) probe += span; @@ -2043,35 +2294,31 @@ xfs_attr_leaf_lookup_int( else break; } - ASSERT((probe >= 0) && - (!leaf->hdr.count - || (probe < be16_to_cpu(leaf->hdr.count)))); - ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); + ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); /* * Since we may have duplicate hashval's, find the first matching * hashval in the leaf. */ - while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { entry--; probe--; } - while ((probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) < hashval)) { + while (probe < ichdr.count && + be32_to_cpu(entry->hashval) < hashval) { entry++; probe++; } - if ((probe == be16_to_cpu(leaf->hdr.count)) || - (be32_to_cpu(entry->hashval) != hashval)) { + if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* * Duplicate keys may be present, so search all of them for a match. */ - for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) == hashval); + for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); entry++, probe++) { /* * GROT: Add code to remove incomplete entries. @@ -2085,33 +2332,36 @@ xfs_attr_leaf_lookup_int( continue; } if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, probe); + name_loc = xfs_attr3_leaf_name_local(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) + if (memcmp(args->name, name_loc->nameval, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, probe); + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); if (name_rmt->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_rmt->name, - args->namelen) != 0) + if (memcmp(args->name, name_rmt->name, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - return(XFS_ERROR(EEXIST)); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->valuelen); + return XFS_ERROR(EEXIST); } } args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* @@ -2119,56 +2369,57 @@ xfs_attr_leaf_lookup_int( * list structure. */ int -xfs_attr_leaf_getvalue( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) { - int valuelen; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(args->index < ichdr.count); - entry = &leaf->entries[args->index]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); valuelen = be16_to_cpu(name_loc->valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; } - return(0); + return 0; } /*======================================================================== @@ -2181,13 +2432,21 @@ xfs_attr_leaf_getvalue( */ /*ARGSUSED*/ STATIC void -xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, - xfs_attr_leafblock_t *leaf_d, int start_d, - int count, xfs_mount_t *mp) +xfs_attr3_leaf_moveents( + struct xfs_attr_leafblock *leaf_s, + struct xfs_attr3_icleaf_hdr *ichdr_s, + int start_s, + struct xfs_attr_leafblock *leaf_d, + struct xfs_attr3_icleaf_hdr *ichdr_d, + int start_d, + int count, + struct xfs_mount *mp) { - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_attr_leaf_entry_t *entry_s, *entry_d; - int desti, tmp, i; + struct xfs_attr_leaf_entry *entry_s; + struct xfs_attr_leaf_entry *entry_d; + int desti; + int tmp; + int i; /* * Check for nothing to do. @@ -2198,45 +2457,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Set up environment. */ - ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - ASSERT((be16_to_cpu(hdr_s->count) > 0) && - (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); - ASSERT(be16_to_cpu(hdr_s->firstused) >= - ((be16_to_cpu(hdr_s->count) - * sizeof(*entry_s))+sizeof(*hdr_s))); - ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); - ASSERT(be16_to_cpu(hdr_d->firstused) >= - ((be16_to_cpu(hdr_d->count) - * sizeof(*entry_d))+sizeof(*hdr_d))); - - ASSERT(start_s < be16_to_cpu(hdr_s->count)); - ASSERT(start_d <= be16_to_cpu(hdr_d->count)); - ASSERT(count <= be16_to_cpu(hdr_s->count)); + ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || + ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); + ASSERT(ichdr_s->magic == ichdr_d->magic); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + + xfs_attr3_leaf_hdr_size(leaf_s)); + ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + + xfs_attr3_leaf_hdr_size(leaf_d)); + + ASSERT(start_s < ichdr_s->count); + ASSERT(start_d <= ichdr_d->count); + ASSERT(count <= ichdr_s->count); + /* * Move the entries in the destination leaf up to make a hole? */ - if (start_d < be16_to_cpu(hdr_d->count)) { - tmp = be16_to_cpu(hdr_d->count) - start_d; + if (start_d < ichdr_d->count) { + tmp = ichdr_d->count - start_d; tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_d->entries[start_d]; - entry_d = &leaf_d->entries[start_d + count]; - memmove((char *)entry_d, (char *)entry_s, tmp); + entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; + memmove(entry_d, entry_s, tmp); } /* * Copy all entry's in the same (sorted) order, * but allocate attribute info packed and in sequence. */ - entry_s = &leaf_s->entries[start_s]; - entry_d = &leaf_d->entries[start_d]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; desti = start_d; for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { - ASSERT(be16_to_cpu(entry_s->nameidx) - >= be16_to_cpu(hdr_s->firstused)); + ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); #ifdef GROT /* @@ -2245,36 +2500,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * off for 6.2, should be revisited later. */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_s->count, -1); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_s->count -= 1; entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ - be16_add_cpu(&hdr_d->firstused, -tmp); + ichdr_d->firstused -= tmp; /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; - /* both on-disk, don't endian flip twice */ - entry_d->nameidx = hdr_d->firstused; + entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp <= XFS_LBSIZE(mp)); - memmove(xfs_attr_leaf_name(leaf_d, desti), - xfs_attr_leaf_name(leaf_s, start_s + i), tmp); + memmove(xfs_attr3_leaf_name(leaf_d, desti), + xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= XFS_LBSIZE(mp)); - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_d->usedbytes, tmp); - be16_add_cpu(&hdr_s->count, -1); - be16_add_cpu(&hdr_d->count, 1); - tmp = be16_to_cpu(hdr_d->count) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_d->usedbytes += tmp; + ichdr_s->count -= 1; + ichdr_d->count += 1; + tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf_d); + ASSERT(ichdr_d->firstused >= tmp); #ifdef GROT } #endif /* GROT */ @@ -2283,71 +2536,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Zero out the entries we just copied. */ - if (start_s == be16_to_cpu(hdr_s->count)) { + if (start_s == ichdr_s->count) { tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + memset(entry_s, 0, tmp); } else { /* * Move the remaining entries down to fill the hole, * then zero the entries at the top. */ - tmp = be16_to_cpu(hdr_s->count) - count; - tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s + count]; - entry_d = &leaf_s->entries[start_s]; - memmove((char *)entry_d, (char *)entry_s, tmp); + tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; + entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + memmove(entry_d, entry_s, tmp); tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + memset(entry_s, 0, tmp); } /* * Fill in the freemap information */ - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * - sizeof(xfs_attr_leaf_entry_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - be16_to_cpu(hdr_d->freemap[0].base)); - hdr_d->freemap[1].base = 0; - hdr_d->freemap[2].base = 0; - hdr_d->freemap[1].size = 0; - hdr_d->freemap[2].size = 0; - hdr_s->holes = 1; /* leaf may not be compact */ -} - -/* - * Compare two leaf blocks "order". - * Return 0 unless leaf2 should go before leaf1. - */ -int -xfs_attr_leaf_order( - struct xfs_buf *leaf1_bp, - struct xfs_buf *leaf2_bp) -{ - xfs_attr_leafblock_t *leaf1, *leaf2; - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && - (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); - if ((be16_to_cpu(leaf1->hdr.count) > 0) && - (be16_to_cpu(leaf2->hdr.count) > 0) && - ((be32_to_cpu(leaf2->entries[0].hashval) < - be32_to_cpu(leaf1->entries[0].hashval)) || - (be32_to_cpu(leaf2->entries[ - be16_to_cpu(leaf2->hdr.count)-1].hashval) < - be32_to_cpu(leaf1->entries[ - be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); + ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + ichdr_d->freemap[1].base = 0; + ichdr_d->freemap[2].base = 0; + ichdr_d->freemap[1].size = 0; + ichdr_d->freemap[2].size = 0; + ichdr_s->holes = 1; /* leaf may not be compact */ } /* @@ -2358,15 +2580,16 @@ xfs_attr_leaf_lasthash( struct xfs_buf *bp, int *count) { - xfs_attr_leafblock_t *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) - return(0); - return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); + *count = ichdr.count; + if (!ichdr.count) + return 0; + return be32_to_cpu(entries[ichdr.count - 1].hashval); } /* @@ -2376,20 +2599,21 @@ xfs_attr_leaf_lasthash( STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { + struct xfs_attr_leaf_entry *entries; xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; int size; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, index); + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); size = xfs_attr_leaf_entsize_local(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); } - return(size); + return size; } /* @@ -2414,35 +2638,40 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) *local = 0; } } - return(size); + return size; } /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ int -xfs_attr_leaf_list_int( - struct xfs_buf *bp, - xfs_attr_list_context_t *context) +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) { - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - int retval, i; + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + + trace_xfs_attr_list_leaf(context); - ASSERT(bp != NULL); leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + cursor = context->cursor; cursor->initted = 1; - trace_xfs_attr_list_leaf(context); - /* * Re-find our place in the leaf block if this is a new syscall. */ if (context->resynch) { - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { if (be32_to_cpu(entry->hashval) == cursor->hashval) { if (cursor->offset == context->dupcnt) { context->dupcnt = 0; @@ -2455,12 +2684,12 @@ xfs_attr_leaf_list_int( break; } } - if (i == be16_to_cpu(leaf->hdr.count)) { + if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return(0); + return 0; } } else { - entry = &leaf->entries[0]; + entry = &entries[0]; i = 0; } context->resynch = 0; @@ -2469,7 +2698,7 @@ xfs_attr_leaf_list_int( * We have found our place, start copying out the new attributes. */ retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { + for (; i < ichdr.count; entry++, i++) { if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -2480,7 +2709,7 @@ xfs_attr_leaf_list_int( if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc = - xfs_attr_leaf_name_local(leaf, i); + xfs_attr3_leaf_name_local(leaf, i); retval = context->put_listent(context, entry->flags, @@ -2492,7 +2721,7 @@ xfs_attr_leaf_list_int( return retval; } else { xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr_leaf_name_remote(leaf, i); + xfs_attr3_leaf_name_remote(leaf, i); int valuelen = be32_to_cpu(name_rmt->valuelen); @@ -2505,7 +2734,8 @@ xfs_attr_leaf_list_int( args.valuelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); if (retval) return retval; @@ -2532,7 +2762,7 @@ xfs_attr_leaf_list_int( cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return(retval); + return retval; } @@ -2544,14 +2774,16 @@ xfs_attr_leaf_list_int( * Clear the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_clearflag(xfs_da_args_t *args) +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; xfs_attr_leaf_name_local_t *name_loc; int namelen; char *name; @@ -2561,23 +2793,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); - ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); namelen = name_loc->namelen; name = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); namelen = name_rmt->namelen; name = (char *)name_rmt->name; } @@ -2592,7 +2826,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) if (args->rmtblkno) { ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); xfs_trans_log_buf(args->trans, bp, @@ -2609,34 +2843,41 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_setflag(xfs_da_args_t *args) +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif trace_xfs_attr_leaf_setflag(args); /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp, @@ -2657,14 +2898,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Note that they could be in different blocks, or in the same block. */ int -xfs_attr_leaf_flipflags(xfs_da_args_t *args) +xfs_attr3_leaf_flipflags( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_entry_t *entry1, *entry2; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp1, *bp2; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr_leaf_entry *entry1; + struct xfs_attr_leaf_entry *entry2; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp1; + struct xfs_buf *bp2; int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; xfs_attr_leaf_name_local_t *name_loc; int namelen1, namelen2; char *name1, *name2; @@ -2675,7 +2922,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); if (error) return error; @@ -2683,7 +2930,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, -1, &bp2); if (error) return error; @@ -2692,31 +2939,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) } leaf1 = bp1->b_addr; - ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); - ASSERT(args->index >= 0); - entry1 = &leaf1->entries[ args->index ]; + entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; leaf2 = bp2->b_addr; - ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); - ASSERT(args->index2 >= 0); - entry2 = &leaf2->entries[ args->index2 ]; + entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + ASSERT(args->index < ichdr1.count); + ASSERT(args->index >= 0); + + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(args->index2 < ichdr2.count); + ASSERT(args->index2 >= 0); + if (entry1->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf1, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); namelen1 = name_loc->namelen; name1 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); namelen1 = name_rmt->namelen; name1 = (char *)name_rmt->name; } if (entry2->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf2, args->index2); + name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); namelen2 = name_loc->namelen; name2 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); namelen2 = name_rmt->namelen; name2 = (char *)name_rmt->name; } @@ -2733,7 +2984,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); xfs_trans_log_buf(args->trans, bp1, @@ -2744,7 +2995,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp2, @@ -2756,7 +3007,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ error = xfs_trans_roll(&args->trans, args->dp); - return(error); + return error; } /*======================================================================== @@ -2768,12 +3019,14 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * We're doing a depth-first traversal in order to invalidate everything. */ int -xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) { - xfs_da_blkinfo_t *info; - xfs_daddr_t blkno; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; /* * Read block 0 to see what we have to work with. @@ -2781,40 +3034,46 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - blkno = XFS_BUF_ADDR(bp); + return error; + blkno = bp->b_bn; /* * Invalidate the tree, even if the "tree" is only a single leaf block. * This is a depth-first traversal! */ info = bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, bp, 1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, bp); - } else { + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: error = XFS_ERROR(EIO); xfs_trans_brelse(*trans, bp); + break; } if (error) - return(error); + return error; /* * Invalidate the incore copy of the root block. */ error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); if (error) - return(error); + return error; xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. */ error = xfs_trans_roll(trans, dp); - return (error); + return error; } /* @@ -2822,7 +3081,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * We're doing a depth-first traversal in order to invalidate everything. */ STATIC int -xfs_attr_node_inactive( +xfs_attr3_node_inactive( struct xfs_trans **trans, struct xfs_inode *dp, struct xfs_buf *bp, @@ -2832,26 +3091,28 @@ xfs_attr_node_inactive( xfs_da_intnode_t *node; xfs_dablk_t child_fsb; xfs_daddr_t parent_blkno, child_blkno; - int error, count, i; + int error, i; struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return(XFS_ERROR(EIO)); + return XFS_ERROR(EIO); } node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ - count = be16_to_cpu(node->hdr.count); - if (!count) { + xfs_da3_node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { xfs_trans_brelse(*trans, bp); - return(0); + return 0; } - child_fsb = be32_to_cpu(node->btree[0].before); + btree = xfs_da3_node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* @@ -2859,14 +3120,14 @@ xfs_attr_node_inactive( * over the leaves removing all of them. If this is higher up * in the tree, recurse downward. */ - for (i = 0; i < count; i++) { + for (i = 0; i < ichdr.count; i++) { /* * Read the subsidiary block to see what we have to work with. * Don't do this in a transaction. This is a depth-first * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, XFS_ATTR_FORK); if (error) return(error); @@ -2878,18 +3139,24 @@ xfs_attr_node_inactive( * Invalidate the subtree, however we have to. */ info = child_bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, - child_bp, level+1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, - child_bp); - } else { + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: error = XFS_ERROR(EIO); xfs_trans_brelse(*trans, child_bp); + break; } if (error) - return(error); + return error; /* * Remove the subsidiary block from the cache @@ -2898,7 +3165,7 @@ xfs_attr_node_inactive( error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, XFS_ATTR_FORK); if (error) - return(error); + return error; xfs_trans_binval(*trans, child_bp); } @@ -2906,12 +3173,12 @@ xfs_attr_node_inactive( * If we're not done, re-read the parent to get the next * child block number. */ - if ((i+1) < count) { - error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, &bp, XFS_ATTR_FORK); if (error) - return(error); - child_fsb = be32_to_cpu(node->btree[i+1].before); + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } /* @@ -2919,10 +3186,10 @@ xfs_attr_node_inactive( */ error = xfs_trans_roll(trans, dp); if (error) - return (error); + return error; } - return(0); + return 0; } /* @@ -2932,29 +3199,35 @@ xfs_attr_node_inactive( * caught holding something that the logging code wants to flush to disk. */ STATIC int -xfs_attr_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_inactive_list_t *list, *lp; - int error, count, size, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); /* * Count the number of "remote" value extents. */ count = 0; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { if (be16_to_cpu(entry->nameidx) && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) count++; } @@ -2965,27 +3238,27 @@ xfs_attr_leaf_inactive( */ if (count == 0) { xfs_trans_brelse(*trans, bp); - return(0); + return 0; } /* * Allocate storage for a list of all the "remote" value extents. */ size = count * sizeof(xfs_attr_inactive_list_t); - list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); + list = kmem_alloc(size, KM_SLEEP); /* * Identify each of the "remote" value extents. */ lp = list; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { if (be16_to_cpu(entry->nameidx) && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, be32_to_cpu(name_rmt->valuelen)); lp++; } @@ -2998,15 +3271,15 @@ xfs_attr_leaf_inactive( */ error = 0; for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr_leaf_freextent(trans, dp, + tmp = xfs_attr3_leaf_freextent(trans, dp, lp->valueblk, lp->valuelen); if (error == 0) error = tmp; /* save only the 1st errno */ } - kmem_free((xfs_caddr_t)list); - return(error); + kmem_free(list); + return error; } /* @@ -3014,14 +3287,20 @@ xfs_attr_leaf_inactive( * invalidate any buffers that are incore/in transactions. */ STATIC int -xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt) +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) { - xfs_bmbt_irec_t map; - xfs_dablk_t tblkno; - int tblkcnt, dblkcnt, nmap, error; - xfs_daddr_t dblkno; - xfs_buf_t *bp; + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; /* * Roll through the "value", invalidating the attribute value's diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 77de139a58f0..444a7704596c 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -89,7 +90,7 @@ typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ __be32 hashval; /* hash value of name */ - __be16 nameidx; /* index into buffer of name/value */ + __be16 nameidx; /* index into buffer of name/value */ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ __u8 pad2; /* unused pad byte */ } xfs_attr_leaf_entry_t; @@ -115,6 +116,55 @@ typedef struct xfs_attr_leafblock { } xfs_attr_leafblock_t; /* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. + */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + __uint16_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified * on the system call, they are "or"ed together for various operations. @@ -147,26 +197,43 @@ typedef struct xfs_attr_leafblock { */ #define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + /* * Cast typed pointers for "local" and "remote" name/value structs. */ -static inline xfs_attr_leaf_name_remote_t * -xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) { - return (xfs_attr_leaf_name_remote_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; } -static inline xfs_attr_leaf_name_local_t * -xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) { - return (xfs_attr_leaf_name_local_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); } -static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) { - return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); } /* @@ -221,37 +288,37 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ -int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); -int xfs_attr_leaf_clearflag(struct xfs_da_args *args); -int xfs_attr_leaf_setflag(struct xfs_da_args *args); -int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); /* * Routines used for growing the Btree. */ -int xfs_attr_leaf_split(struct xfs_da_state *state, +int xfs_attr3_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_buf *bp, +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. */ -int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); -void xfs_attr_leaf_unbalance(struct xfs_da_state *state, +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. @@ -261,10 +328,12 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); -extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c new file mode 100644 index 000000000000..ef6b0c124528 --- /dev/null +++ b/fs/xfs/xfs_attr_remote.c @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. + */ +int +xfs_attr3_rmt_blocks( + struct xfs_mount *mp, + int attrlen) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_attr3_rmt_verify( + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + return false; + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + return false; + if (be32_to_cpu(rmt->rm_offset) + + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + return false; + if (rmt->rm_owner == 0) + return false; + + return true; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + bool corrupt = false; + xfs_daddr_t bno; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), + XFS_ATTR3_RMT_CRC_OFF)) { + corrupt = true; + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + corrupt = true; + break; + } + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + + if (corrupt) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } else + ASSERT(len == 0); +} + +static void +xfs_attr3_rmt_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); + + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + ASSERT(len == 0); +} + +const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .verify_read = xfs_attr3_rmt_read_verify, + .verify_write = xfs_attr3_rmt_write_verify, +}; + +STATIC int +xfs_attr3_rmt_hdr_set( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); + rmt->rm_offset = cpu_to_be32(offset); + rmt->rm_bytes = cpu_to_be32(size); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + rmt->rm_owner = cpu_to_be64(ino); + rmt->rm_blkno = cpu_to_be64(bno); + + return sizeof(struct xfs_attr3_rmt_hdr); +} + +/* + * Helper functions to copy attribute data in and out of the one disk extents + */ +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **dst) +{ + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min_t(int, *valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + src += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == XFS_LBSIZE(mp)); + memset(dst + hdr_size + byte_cnt, 0, + XFS_LBSIZE(mp) - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + dst += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } +} + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get( + struct xfs_da_args *args) +{ + struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_buf *bp; + xfs_dablk_t lblkno = args->rmtblkno; + char *dst = args->value; + int valuelen = args->valuelen; + int nmap; + int error; + int blkcnt = args->rmtblkcnt; + int i; + int offset = 0; + + trace_xfs_attr_rmtval_get(args); + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, dblkcnt, 0, &bp, + &xfs_attr3_rmt_buf_ops); + if (error) + return error; + + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); + xfs_buf_relse(bp); + if (error) + return error; + + /* roll attribute extent map forwards */ + lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + xfs_fileoff_t lfileoff = 0; + char *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; + + trace_xfs_attr_rmtval_set(args); + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. Because CRC enable + * attributes have headers, we can't just do a straight byte to FSB + * conversion and have to take the header space into account. + */ + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) + return error; + + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + int committed; + + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + valuelen = args->valuelen; + while (valuelen > 0) { + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); + + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); + if (!bp) + return ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + + /* roll attribute extent map forwards */ + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; + + trace_xfs_attr_rmtval_remove(args); + + /* + * Roll through the "value", invalidating the attribute value's blocks. + * Note that args->rmtblkcnt is the minimum number of data blocks we'll + * see for a CRC enabled remote attribute. Each extent will have a + * header, and so we may have more blocks than we realise here. If we + * fail to map the blocks correctly, we'll have problems with the buffer + * lookups. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + int committed; + + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} + diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h new file mode 100644 index 000000000000..92a8fd7977cc --- /dev/null +++ b/fs/xfs/xfs_attr_remote.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ + +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +/* + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; + +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); + +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index b44af9211bd9..89042848f9ec 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -25,6 +25,7 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" +#include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" @@ -47,180 +48,78 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_symlink.h" kmem_zone_t *xfs_bmap_free_item_zone; /* - * Prototypes for internal bmap routines. - */ - -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents( - struct xfs_btree_cur *cur, - struct xfs_inode *ip, - int whichfork); -#else -#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#endif - - -/* - * Called from xfs_bmap_add_attrfork to handle extents format files. - */ -STATIC int /* error */ -xfs_bmap_add_attrfork_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ - -/* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Miscellaneous helper functions */ -STATIC int /* error */ -xfs_bmap_add_attrfork_local( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ /* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. - */ -STATIC int /* error */ -xfs_bmap_alloc( - xfs_bmalloca_t *ap); /* bmap alloc argument struct */ - -/* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free); /* list item to be freed */ - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Convert a local file to an extents file. - * This code is sort of bogus, since the file data needs to get - * logged so it won't be lost. The bmap-level manipulations are ok, though. - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork, /* data or attr fork */ - void (*init_fn)(struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp)); - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len); /* delayed extent length */ - -#ifdef DEBUG -/* - * Perform various validation checks on the values being returned - * from xfs_bmapi(). + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap); -#else -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) -#endif /* DEBUG */ - -STATIC int -xfs_bmap_count_tree( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ifork_t *ifp, - xfs_fsblock_t blockno, - int levelin, - int *count); - -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count); +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count); + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. + */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} /* - * Bmap internal routines. + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} STATIC int /* error */ xfs_bmbt_lookup_eq( @@ -290,6 +189,1070 @@ xfs_bmbt_update( } /* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +/* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp, ip->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + return offset; +} + +/* + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. + */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_mount_t *mp, + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) + ip->i_d.di_forkoff = dfl_forkoff; + } +} + +/* + * Extent tree block counting routines. + */ + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks is use. + */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * Debug/sanity checking code + */ + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && + block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) + return 0; + + if (be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + + return 1; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} + +/* + * Add bmap trace insert entries for all the contents of the extent records. + */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the callers original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extent beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} + +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +/* + * bmap free list manipulation functions + */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + unsigned int logres; /* new log reservation */ + unsigned int logcount; /* new log count */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + logres = ntp->t_log_res; + logcount = ntp->t_log_count; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, + logcount))) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Inode fork format manipulation functions + */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + + /* + * Fill in the root. + */ + block = ifp->if_broot; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. + */ + abp->b_ops = &xfs_bmbt_buf_ops; + ablock = XFS_BUF_TO_BLOCK(abp); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); + + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork, + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) +{ + int error; /* error return value */ + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. + */ + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + flags = 0; + error = 0; + if (ifp->if_bytes) { + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.minlen = args.maxlen = args.prod = 1; + error = xfs_alloc_vextent(&args); + if (error) + goto done; + + /* Can't fail, the space was reserved. */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + + /* initialise the block and copy the data */ + init_fn(tp, bp, ip, ifp); + + /* account for the change in fork size and log everything */ + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_bmap_forkoff_reset(args.mp, ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + } else { + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + } + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + flags |= XFS_ILOG_CORE; +done: + *logflagsp = flags; + return error; +} + +/* * Called from xfs_bmap_add_attrfork to handle btree format files. */ STATIC int /* error */ @@ -360,29 +1323,22 @@ xfs_bmap_add_attrfork_extents( } /* - * Block initialisation functions for local to extent format conversion. - * As these get more complex, they will be moved to the relevant files, - * but for now they are too simple to worry about. + * Block initialisation function for local to extent format conversion. + * + * This shouldn't actually be called by anyone, so make sure debug kernels cause + * a noticable failure. */ STATIC void xfs_bmap_local_to_extents_init_fn( + struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp) { + ASSERT(0); bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); -} - -STATIC void -xfs_symlink_local_to_remote( - struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp) -{ - /* remote symlink blocks are not verifiable until CRCs come along */ - bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); } /* @@ -394,8 +1350,7 @@ xfs_symlink_local_to_remote( * * XXX (dgc): investigate whether directory conversion can use the generic * formatting callout. It should be possible - it's just a very complex - * formatter. it would also require passing the transaction through to the init - * function. + * formatter. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -432,6 +1387,640 @@ xfs_bmap_add_attrfork_local( } /* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + + ASSERT(XFS_IFORK_Q(ip) == 0); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) + goto error0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + if (XFS_IFORK_Q(ip)) + goto error1; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto error1; + } + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto error2; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + __int64_t sbfields = 0; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + sbfields |= XFS_SB_VERSIONNUM; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } + if (sbfields) { + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + } else + spin_unlock(&mp->m_sb_lock); + } + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error2; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +error2: + xfs_bmap_cancel(&flist); +error1: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; +} + +/* + * Internal and external extent tree search functions. + */ + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. + */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, 0), + error0); + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); + /* + * Copy records into the extent records. + */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); +} + + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +STATIC int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error || is_empty) + return error; + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int +xfs_bmap_last_offset( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return XFS_ERROR(EIO); + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + *last_block = rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +/* + * Extent tree manipulation functions used during allocation. + */ + +/* * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ @@ -1894,6 +3483,10 @@ done: } /* + * Functions used in the extent read, allocate and remove paths + */ + +/* * Adjust the size of the new extent based on di_extsize and rt extsize. */ STATIC int @@ -2666,1628 +4259,6 @@ xfs_bmap_alloc( } /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - /* REFERENCED */ - struct xfs_btree_block *cblock;/* child btree block */ - xfs_fsblock_t cbno; /* child block number */ - xfs_buf_t *cbp; /* child block's buffer */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ - __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; - ASSERT(be16_to_cpu(rblock->bb_level) == 1); - ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); - cbno = be64_to_cpu(*pp); - *logflagsp = 0; -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, cbno, 1))) - return error; -#endif - error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - cblock = XFS_BUF_TO_BLOCK(cbp); - if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) - return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); - ip->i_d.di_nblocks--; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; - xfs_iroot_realloc(ip, -1, whichfork); - ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - return 0; -} - -/* - * Called by xfs_bmapi to update file extent records and the btree - * after removing space (or undoing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_del_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *del, /* data to remove from extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ - xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ - xfs_fsblock_t del_endblock=0; /* first block past del */ - xfs_fileoff_t del_endoff; /* first offset past del */ - int delay; /* current block is delayed allocated */ - int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ - int error; /* error return value */ - int flags; /* inode logging flags */ - xfs_bmbt_irec_t got; /* current extent entry */ - xfs_fileoff_t got_endoff; /* first offset past got */ - int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t nblks; /* quota/sb block count */ - xfs_bmbt_irec_t new; /* new record to be inserted */ - /* REFERENCED */ - uint qfield; /* quota field to update */ - xfs_filblks_t temp; /* for indirect length calculations */ - xfs_filblks_t temp2; /* for indirect length calculations */ - int state = 0; - - XFS_STATS_INC(xs_del_exlist); - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= del->br_startoff); - del_endoff = del->br_startoff + del->br_blockcount; - got_endoff = got.br_startoff + got.br_blockcount; - ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - flags = 0; - qfield = 0; - error = 0; - /* - * If deleting a real allocation, must free up the disk space. - */ - if (!delay) { - flags = XFS_ILOG_CORE; - /* - * Realtime allocation. Free it and record di_nblocks update. - */ - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; - xfs_filblks_t len; - - ASSERT(do_mod(del->br_blockcount, - mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, - mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; - do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; - qfield = XFS_TRANS_DQ_RTBCOUNT; - } - /* - * Ordinary allocation. - */ - else { - do_fx = 1; - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - } - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - if (cur) { - if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, got.br_blockcount, - &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; - do_fx = 0; - } - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: - /* - * Matches the whole extent. Delete the entry. - */ - xfs_iext_remove(ip, *idx, 1, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; - if (delay) - break; - - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - flags |= XFS_ILOG_CORE; - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_btree_delete(cur, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - break; - - case 2: - /* - * Deleting the first part of the extent. - */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 1: - /* - * Deleting the last part of the extent. - */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 0: - /* - * Deleting the middle of the extent. - */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; - new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; - if (cur) { - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, temp, - got.br_state))) - goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto done; - cur->bc_rec.b = new; - error = xfs_btree_insert(cur, &i); - if (error && error != ENOSPC) - goto done; - /* - * If get no-space back from btree insert, - * it tried a split, and we have a zero - * block reservation. - * Fix up our state and return the error. - */ - if (error == ENOSPC) { - /* - * Reset the cursor, don't trust - * it after any insert operation. - */ - if ((error = xfs_bmbt_lookup_eq(cur, - got.br_startoff, - got.br_startblock, - temp, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - /* - * Update the btree record back - * to the original value. - */ - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state))) - goto done; - /* - * Reset the extent record back - * to the original value. - */ - xfs_bmbt_set_blockcount(ep, - got.br_blockcount); - flags = 0; - error = XFS_ERROR(ENOSPC); - goto done; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } else - flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - } else { - ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; - break; - } - /* - * If we need to, add to list of extents to delete. - */ - if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); - /* - * Adjust inode # blocks in the file. - */ - if (nblks) - ip->i_d.di_nblocks -= nblks; - /* - * Adjust quota data. - */ - if (qfield) - xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); - - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } -done: - *logflagsp = flags; - return error; -} - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ -{ - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *ablock; /* allocated (child) bt block */ - xfs_buf_t *abp; /* buffer for ablock */ - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_rec_t *arp; /* child record pointer */ - struct xfs_btree_block *block; /* btree root block */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_key_t *kp; /* root block key pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_ptr_t *pp; /* root block address pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - - /* - * Make space in the inode incore. - */ - xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; - - /* - * Fill in the root. - */ - block = ifp->if_broot; - block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - block->bb_level = cpu_to_be16(1); - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - - /* - * Need a cursor. Can't allocate until bb_level is filled in. - */ - mp = ip->i_mount; - cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - /* - * Convert to a btree with two levels, one record in root. - */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = mp; - args.firstblock = *firstblock; - if (*firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = *firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = *firstblock; - } - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = wasdel; - *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - xfs_iroot_realloc(ip, -1, whichfork); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); - *firstblock = cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - ip->i_d.di_nblocks++; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); - /* - * Fill in the child block. - */ - abp->b_ops = &xfs_bmbt_buf_ops; - ablock = XFS_BUF_TO_BLOCK(abp); - ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - ablock->bb_level = 0; - ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { - arp->l0 = cpu_to_be64(ep->l0); - arp->l1 = cpu_to_be64(ep->l1); - arp++; cnt++; - } - } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_btree_set_numrecs(ablock, cnt); - - /* - * Fill in the root key and pointer. - */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, - be16_to_cpu(block->bb_level))); - *pp = cpu_to_be64(args.fsbno); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); - xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); - ASSERT(*curp == NULL); - *curp = cur; - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); - return 0; -} - -/* - * Calculate the default attribute fork offset for newly created inodes. - */ -uint -xfs_default_attroffset( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - uint offset; - - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { - offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - - ASSERT(offset < XFS_LITINO(mp)); - return offset; -} - -/* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. - */ -STATIC void -xfs_bmap_forkoff_reset( - xfs_mount_t *mp, - xfs_inode_t *ip, - int whichfork) -{ - if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_UUID && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { - uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - - if (dfl_forkoff > ip->i_d.di_forkoff) - ip->i_d.di_forkoff = dfl_forkoff; - } -} - -/* - * Convert a local file to an extents file. - * This code is out of bounds for data forks of regular files, - * since the file data needs to get logged so things will stay consistent. - * (The bmap-level manipulations are ok, though). - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork, - void (*init_fn)(struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp)) -{ - int error; /* error return value */ - int flags; /* logging flags returned */ - xfs_ifork_t *ifp; /* inode fork pointer */ - - /* - * We don't want to deal with the case of keeping inode data inline yet. - * So sending the data fork of a regular inode is invalid. - */ - ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - flags = 0; - error = 0; - if (ifp->if_bytes) { - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep;/* extent record pointer */ - - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = ip->i_mount; - args.firstblock = *firstblock; - /* - * Allocate a block. We know we need only one, since the - * file currently fits in an inode. - */ - if (*firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = *firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.total = total; - args.minlen = args.maxlen = args.prod = 1; - error = xfs_alloc_vextent(&args); - if (error) - goto done; - - /* Can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(args.len == 1); - *firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - - /* initialise the block and copy the data */ - init_fn(bp, ip, ifp); - - /* account for the change in fork size and log everything */ - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); - xfs_bmap_forkoff_reset(args.mp, ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - trace_xfs_bmap_post_update(ip, 0, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, - _THIS_IP_); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - ip->i_d.di_nblocks = 1; - xfs_trans_mod_dquot_byino(tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); - flags |= xfs_ilog_fext(whichfork); - } else { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); - } - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - flags |= XFS_ILOG_CORE; -done: - *logflagsp = flags; - return error; -} - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; -#if XFS_BIG_BLKNOS - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - gotp->br_startblock = 0xffffa5a5; -#endif - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x\n", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ -{ - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ - - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; - for (level = 0, rval = 0; - level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); - level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - - level - 1; - if (level == 0) - maxrecs = mp->m_bmap_dmxr[1]; - } - return rval; -} - -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ - int size, /* space new attribute needs */ - int rsvd) /* xact may use reserved blks */ -{ - xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ - int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ - int logflags; /* logging flags */ - int error; /* error return value */ - - ASSERT(XFS_IFORK_Q(ip) == 0); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); - blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) - goto error0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } - if (XFS_IFORK_Q(ip)) - goto error1; - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } - ASSERT(ip->i_d.di_anextents == 0); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = XFS_ERROR(EINVAL); - goto error1; - } - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_flags = XFS_IFEXTENTS; - logflags = 0; - xfs_bmap_init(&flist, &firstblock); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, - &logflags); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, - &logflags); - break; - default: - error = 0; - break; - } - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); - if (error) - goto error2; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { - __int64_t sbfields = 0; - - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); - sbfields |= XFS_SB_VERSIONNUM; - } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); - sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); - } - if (sbfields) { - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, sbfields); - } else - spin_unlock(&mp->m_sb_lock); - } - - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: - xfs_bmap_cancel(&flist); -error1: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - return error; -} - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -/* ARGSUSED */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ -{ - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; - flist->xbf_count++; -} - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - xfs_mount_t *mp, /* file system mount structure */ - int whichfork) /* data or attr fork */ -{ - int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ - int maxrootrecs; /* max records in root block */ - int minleafrecs; /* min records in leaf block */ - int minnoderecs; /* min records in node block */ - int sz; /* root block size */ - - /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). - * - * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be - * (xfs_default_attroffset(ip) >> 3) because we could have mounted - * with ATTR2 and then mounted back with ATTR1, keeping the - * di_forkoff's fixed but probably at various positions. Therefore, - * for both ATTR1 and ATTR2 we have to assume the worst case scenario - * of a minimum size available. - */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } - maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); - minleafrecs = mp->m_bmap_dmnr[0]; - minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) { - if (maxblocks <= maxrootrecs) - maxblocks = 1; - else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - } - mp->m_bm_maxlevels[whichfork] = level; -} - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. - */ -int /* error */ -xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ -{ - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - unsigned int logres; /* new log reservation */ - unsigned int logcount; /* new log count */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; - return 0; - } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - logres = ntp->t_log_res; - logcount = ntp->t_log_count; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) - return error; - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, - logcount))) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_bmap_del_free(flist, NULL, free); - } - return 0; -} - -/* - * Free up any items left in the list. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ -{ - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; - - if (flist->xbf_count == 0) - return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); - } - ASSERT(flist->xbf_count == 0); -} - -/* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). - */ -int /* error */ -xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ -{ - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ - xfs_extnum_t nextents; /* number of extent entries */ - - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *first_unused = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); - /* - * See if the hole before this extent will work. - */ - if (off >= lowest + len && off - max >= len) { - *first_unused = max; - return 0; - } - lastaddr = off + xfs_bmbt_get_blockcount(ep); - max = XFS_FILEOFF_MAX(lastaddr, lowest); - } - *first_unused = max; - return 0; -} - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int /* error */ -xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ -{ - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; - } - /* - * Otherwise *last_block is already the right answer. - */ - return 0; -} - -STATIC int -xfs_bmap_last_extent( - struct xfs_trans *tp, - struct xfs_inode *ip, - int whichfork, - struct xfs_bmbt_irec *rec, - int *is_empty) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int error; - int nextents; - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } - - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *is_empty = 1; - return 0; - } - - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); - *is_empty = 0; - return 0; -} - -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be - * at, or past the EOF. - */ -STATIC int -xfs_bmap_isaeof( - struct xfs_bmalloca *bma, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - bma->aeof = 0; - error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, - &is_empty); - if (error || is_empty) - return error; - - /* - * Check if we are allocation or past the last extent, or at least into - * the last delayed allocated extent. - */ - bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || - (bma->offset >= rec.br_startoff && - isnullstartblock(rec.br_startblock)); - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int -xfs_bmap_last_offset( - struct xfs_trans *tp, - struct xfs_inode *ip, - xfs_fileoff_t *last_block, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - *last_block = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) - return 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return XFS_ERROR(EIO); - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); - if (error || is_empty) - return error; - - *last_block = rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) - return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) || - be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - return 1; -} - -/* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. - */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - xfs_trans_brelse(tp, bp); - } - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - i = 0; - /* - * Loop over all leaf nodes. Copy information to the extent records. - */ - for (;;) { - xfs_bmbt_rec_t *frp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - xfs_extnum_t start; - - num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", - XFS_ERRLEVEL_LOW, ip->i_mount, block); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1, - &xfs_bmbt_buf_ops); - /* - * Copy records into the extent records. - */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; - for (j = 0; j < num_recs; j++, i++, frp++) { - xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); - trp->l0 = be64_to_cpu(frp->l0); - trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - goto error0; - } - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); - return 0; -error0: - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); -} - -#ifdef DEBUG -/* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ - unsigned long caller_ip) -{ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int state = 0; - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); -} - -/* - * Validate that the bmbt_irecs being returned from bmapi are valid - * given the callers original parameters. Specifically check the - * ranges of the returned irecs to ensure that they only extent beyond - * the given parameters if the XFS_BMAPI_ENTIRE flag was set. - */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap) -{ - int i; /* index to map values */ - - ASSERT(ret_nmap <= nmap); - - for (i = 0; i < ret_nmap; i++) { - ASSERT(mval[i].br_blockcount > 0); - if (!(flags & XFS_BMAPI_ENTIRE)) { - ASSERT(mval[i].br_startoff >= bno); - ASSERT(mval[i].br_blockcount <= len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= - bno + len); - } else { - ASSERT(mval[i].br_startoff < bno + len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount > - bno); - } - ASSERT(i == 0 || - mval[i - 1].br_startoff + mval[i - 1].br_blockcount == - mval[i].br_startoff); - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); - ASSERT(mval[i].br_state == XFS_EXT_NORM || - mval[i].br_state == XFS_EXT_UNWRITTEN); - } -} -#endif /* DEBUG */ - - -/* * Trim the returned map to the required bounds */ STATIC void @@ -5151,6 +5122,328 @@ error0: } /* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), 0); + } +done: + *logflagsp = flags; + return error; +} + +/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then @@ -5811,416 +6104,6 @@ xfs_getbmap( return error; } -#ifdef DEBUG -STATIC struct xfs_buf * -xfs_bmap_get_bp( - struct xfs_btree_cur *cur, - xfs_fsblock_t bno) -{ - struct xfs_log_item_desc *lidp; - int i; - - if (!cur) - return NULL; - - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - if (!cur->bc_bufs[i]) - break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) - return cur->bc_bufs[i]; - } - - /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; - if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) - return bip->bli_buf; - } - - return NULL; -} - -STATIC void -xfs_check_block( - struct xfs_btree_block *block, - xfs_mount_t *mp, - int root, - short sz) -{ - int i, j, dmxr; - __be64 *pp, *thispa; /* pointer to block address */ - xfs_bmbt_key_t *prevp, *keyp; - - ASSERT(be16_to_cpu(block->bb_level) > 0); - - prevp = NULL; - for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { - dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); - - if (prevp) { - ASSERT(be64_to_cpu(prevp->br_startoff) < - be64_to_cpu(keyp->br_startoff)); - } - prevp = keyp; - - /* - * Compare the block numbers to see if there are dups. - */ - if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); - else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); - - for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { - if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); - else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); - if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", - __func__, j, i, - (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", - __func__); - } - } - } -} - -/* - * Check that the extents for the inode ip are in the right order in all - * btree leaves. - */ - -STATIC void -xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i=0, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ - xfs_bmbt_rec_t *nextp; /* pointer to next extent */ - int bp_release = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { - return; - } - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - /* See if buf is in cur first */ - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - - /* - * Check this block for basic sanity (increasing keys and - * no duplicate blocks). - */ - - xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - } - - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; - - /* - * Loop over all leaf nodes checking that all extents are in the right order. - */ - for (;;) { - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - - - num_recs = xfs_btree_get_numrecs(block); - - /* - * Read-ahead the next leaf block, if any. - */ - - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - - /* - * Check all the extents to make sure they are OK. - * If we had a previous block, the last entry should - * conform with the first entry in this one. - */ - - ep = XFS_BMBT_REC_ADDR(mp, block, 1); - if (i) { - ASSERT(xfs_bmbt_disk_get_startoff(&last) + - xfs_bmbt_disk_get_blockcount(&last) <= - xfs_bmbt_disk_get_startoff(ep)); - } - for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); - ASSERT(xfs_bmbt_disk_get_startoff(ep) + - xfs_bmbt_disk_get_blockcount(ep) <= - xfs_bmbt_disk_get_startoff(nextp)); - ep = nextp; - } - - last = *ep; - i += num_recs; - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - return; - -error0: - xfs_warn(mp, "%s: at error0", __func__); - if (bp_release) - xfs_trans_brelse(NULL, bp); -error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", - __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); - return; -} -#endif - -/* - * Count fsblocks of the given fork. - */ -int /* error */ -xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); - return 0; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } - - return 0; -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks is use. - */ -STATIC int /* error */ -xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ -{ - int error; - xfs_buf_t *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); - } - - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - } - } - return 0; -} - -/* - * Count leaf blocks given a range of extent records. - */ -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) -{ - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); - } -} - -/* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - /* * dead simple method of punching delalyed allocation blocks from a range in * the inode. Walks a block at a time so will be slow, but is only executed in @@ -6295,16 +6178,3 @@ next_block: return error; } - -/* - * Convert the given file system block to a disk block. We have to treat it - * differently based on whether the file is a real time file or not, because the - * bmap code does. - */ -xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 061b45cbe614..0c61a22be6fd 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -37,6 +37,7 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Determine the extent state. @@ -59,24 +60,31 @@ xfs_extent_state( */ void xfs_bmdr_to_bmbt( - struct xfs_mount *mp, + struct xfs_inode *ip, xfs_bmdr_block_t *dblock, int dblocklen, struct xfs_btree_block *rblock, int rblocklen) { + struct xfs_mount *mp = ip->i_mount; int dmxr; xfs_bmbt_key_t *fkp; __be64 *fpp; xfs_bmbt_key_t *tkp; __be64 *tpp; - rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); @@ -424,7 +432,13 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_level != 0); @@ -708,59 +722,89 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -static void +static int xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); unsigned int level; - int lblock_ok; /* block passes checks */ - /* magic number and level verification. + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. * - * We don't know waht fork we belong to, so just verify that the level + * We don't know what fork we belong to, so just verify that the level * is less than the maximum of the two. Later checks will be more * precise. */ level = be16_to_cpu(block->bb_level); - lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && - level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); - - /* numrecs verification */ - lblock_ok = lblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; /* sibling pointer verification */ - lblock_ok = lblock_ok && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (!lblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; + } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!(xfs_btree_lblock_verify_crc(bp) && + xfs_bmbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!xfs_bmbt_verify(bp)) { + xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn); + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + xfs_btree_lblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -769,7 +813,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bmbt_keys_inorder( struct xfs_btree_cur *cur, @@ -809,7 +853,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .buf_ops = &xfs_bmbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, #endif @@ -838,6 +882,8 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 88469ca08696..1b726d626941 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,7 +18,8 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ struct xfs_btree_cur; struct xfs_btree_block; @@ -136,10 +137,10 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ ((xfs_bmbt_rec_t *) \ @@ -186,15 +187,17 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) -#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(XFS_BTREE_LBLOCK_LEN + \ +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BROOT_SPACE(bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) +#define XFS_BMAP_BMDR_SPACE(bb) \ + (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) /* * Maximum number of bmap btree levels. @@ -204,7 +207,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index db010408d701..0903960410a2 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -30,9 +30,11 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Cursor allocation zone. @@ -42,9 +44,13 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { - XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } }; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] STATIC int /* error (0 or EFSCORRUPTED) */ @@ -54,30 +60,38 @@ xfs_btree_check_lblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer for block, if any */ { - int lblock_ok; /* block passes checks */ + int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ mp = cur->bc_mp; - lblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && + be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, - mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -90,16 +104,26 @@ xfs_btree_check_sblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block */ { + struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for ag. freespace struct */ struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok; /* block passes checks */ + int sblock_ok = 1; /* block passes checks */ + mp = cur->bc_mp; agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - sblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -109,13 +133,13 @@ xfs_btree_check_sblock( (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && block->bb_u.s.bb_rightsib; - if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", - XFS_ERRLEVEL_LOW, cur->bc_mp, block); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -194,6 +218,72 @@ xfs_btree_check_ptr( #endif /* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); + return true; +} + +/* * Delete the btree cursor. */ void @@ -277,10 +367,8 @@ xfs_btree_dup_cursor( *ncur = NULL; return error; } - new->bc_bufs[i] = bp; - ASSERT(!xfs_buf_geterror(bp)); - } else - new->bc_bufs[i] = NULL; + } + new->bc_bufs[i] = bp; } *ncur = new; return 0; @@ -321,9 +409,14 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? - XFS_BTREE_LBLOCK_LEN : - XFS_BTREE_SBLOCK_LEN; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; } /* @@ -863,43 +956,85 @@ xfs_btree_set_sibling( } void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + } + } +} + +void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags) { - struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); - - new->bb_magic = cpu_to_be32(magic); - new->bb_level = cpu_to_be16(level); - new->bb_numrecs = cpu_to_be16(numrecs); - - if (flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - } else { - new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - } + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); } STATIC void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, - int numrecs, - struct xfs_buf *bp) + int numrecs) { - xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], - level, numrecs, cur->bc_flags); + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); } /* * Return true if ptr is the last record in the btree and - * we need to track updateѕ to this record. The decision + * we need to track updates to this record. The decision * will be further refined in the update_lastrec method. */ STATIC int @@ -1147,6 +1282,7 @@ xfs_btree_log_keys( XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); @@ -1171,6 +1307,7 @@ xfs_btree_log_recs( XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); @@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); @@ -1223,7 +1361,12 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), - XFS_BTREE_SBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), @@ -1231,17 +1374,40 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), - XFS_BTREE_LBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN }; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBI(cur, bp, fields); if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } xfs_btree_offsets(fields, (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? loffsets : soffsets, - XFS_BB_NUM_BITS, &first, &last); + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, @@ -2204,7 +2370,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. @@ -2378,7 +2544,17 @@ xfs_btree_new_iroot( if (error) goto error0; + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); @@ -2513,7 +2689,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index f932897194eb..55e3c7cc3c3d 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -42,11 +42,15 @@ extern kmem_zone_t *xfs_btree_cur_zone; * Generic btree header. * * This is a combination of the actual format used on disk for short and long - * format btrees. The first three fields are shared by both format, but - * the pointers are different and should be used with care. + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. * - * To get the size of the actual short or long form headers please use - * the size macros below. Never use sizeof(xfs_btree_block). + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. */ struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ @@ -56,10 +60,23 @@ struct xfs_btree_block { struct { __be32 bb_leftsib; __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; } s; /* short form pointers */ struct { __be64 bb_leftsib; __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ } l; /* long form pointers */ } bb_u; /* rest */ }; @@ -67,6 +84,16 @@ struct xfs_btree_block { #define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ #define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + /* * Generic key, ptr and record wrapper structures. @@ -101,13 +128,11 @@ union xfs_btree_rec { #define XFS_BB_NUMRECS 0x04 #define XFS_BB_LEFTSIB 0x08 #define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_BLKNO 0x20 #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) - -/* - * Magic numbers for btree blocks. - */ -extern const __uint32_t xfs_magics[]; +#define XFS_BB_NUM_BITS_CRC 8 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -190,7 +215,7 @@ struct xfs_btree_ops { const struct xfs_buf_ops *buf_ops; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, @@ -256,6 +281,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_NOERROR 0 @@ -393,8 +419,20 @@ xfs_btree_init_block( __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags); +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + /* * Common btree core entry points. */ @@ -408,6 +446,14 @@ int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); /* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); + +/* * Internal btree helpers also used by xfs_bmap.c. */ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8459b5d8cb71..1b2472a46e46 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -513,6 +513,7 @@ _xfs_buf_find( xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", __func__, blkno, eofs); + WARN_ON(1); return NULL; } @@ -1022,7 +1023,9 @@ xfs_buf_iodone_work( bool read = !!(bp->b_flags & XBF_READ); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - if (read && bp->b_ops) + + /* only validate buffers that were read without errors */ + if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) bp->b_ops->verify_read(bp); if (bp->b_iodone) @@ -1647,7 +1650,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cf263476d6b4..bfc4e0c26fd3 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -140,6 +140,16 @@ xfs_buf_item_size( ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + return XFS_LOG_VEC_ORDERED; + } + /* * the vector count is based on the number of buffer vectors we have * dirty bits in. This will only be greater than one when we have a @@ -212,6 +222,7 @@ xfs_buf_item_format_segment( goto out; } + /* * Fill in an iovec for each set of contiguous chunks. */ @@ -262,12 +273,7 @@ xfs_buf_item_format_segment( vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ + nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; @@ -304,18 +310,36 @@ xfs_buf_item_format( /* * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer * this state if the inode buffer allocation has not yet been committed * to the log as setting the XFS_BLI_INODE_BUF flag will prevent * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + for (i = 0; i < bip->bli_format_count; i++) { vecp = xfs_buf_item_format_segment(bip, vecp, offset, &bip->bli_formats[i]); @@ -345,6 +369,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_pin(bip); @@ -517,8 +542,9 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted, clean, i; - uint hold; + bool clean; + bool aborted; + int flags; /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -529,23 +555,21 @@ xfs_buf_item_unlock( * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; - + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. */ - hold = bip->bli_flags & XFS_BLI_HOLD; - - /* Clear the per transaction state. */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (bip->bli_flags & XFS_BLI_STALE) { + if (flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -562,13 +586,19 @@ xfs_buf_item_unlock( * be the only reference to the buf item, so we free it anyway * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. */ - clean = 1; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = 0; - break; + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } } } if (clean) @@ -581,7 +611,7 @@ xfs_buf_item_unlock( } else atomic_dec(&bip->bli_refcount); - if (!hold) + if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); } @@ -847,12 +877,6 @@ xfs_buf_item_log( struct xfs_buf *bp = bip->bli_buf; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * walk each buffer segment and mark them dirty appropriately. */ start = 0; @@ -878,7 +902,7 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has some data that has been logged (at any + * Return 1 if the buffer has been logged or ordered in a transaction (at any * point, not just the current transaction) and 0 if not. */ uint @@ -912,11 +936,11 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); - bip = bp->b_fspriv; bp->b_fspriv = bip->bli_item.li_bio_list; if (bp->b_fspriv == NULL) bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index ee36c88ecfde..0f1c247dc680 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -24,19 +24,20 @@ extern kmem_zone_t *xfs_buf_item_zone; * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLF_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF (1<<0) /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLF_CANCEL 0x2 +#define XFS_BLF_CANCEL (1<<1) + /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLF_UDQUOT_BUF 0x4 -#define XFS_BLF_PDQUOT_BUF 0x8 -#define XFS_BLF_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 @@ -61,6 +62,55 @@ typedef struct xfs_buf_log_format { } xfs_buf_log_format_t; /* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. + */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* * buf log item flags */ #define XFS_BLI_HOLD 0x01 @@ -70,6 +120,7 @@ typedef struct xfs_buf_log_format { #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 #define XFS_BLI_INODE_BUF 0x40 +#define XFS_BLI_ORDERED 0x80 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -78,7 +129,8 @@ typedef struct xfs_buf_log_format { { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ - { XFS_BLI_INODE_BUF, "INODE_BUF" } + { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ + { XFS_BLI_ORDERED, "ORDERED" } #ifdef __KERNEL__ @@ -113,6 +165,10 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); + #endif /* __KERNEL__ */ #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4d7696a02418..0b8b2a13cd24 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -38,6 +39,8 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* * xfs_da_btree.c @@ -52,69 +55,195 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_da_root_split(xfs_da_state_t *state, +STATIC int xfs_da3_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_root, xfs_da_state_blk_t *new_child); -STATIC int xfs_da_node_split(xfs_da_state_t *state, +STATIC int xfs_da3_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_blk, xfs_da_state_blk_t *split_blk, xfs_da_state_blk_t *blk_to_add, int treelevel, int *result); -STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *node_blk_1, xfs_da_state_blk_t *node_blk_2); -STATIC void xfs_da_node_add(xfs_da_state_t *state, +STATIC void xfs_da3_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *old_node_blk, xfs_da_state_blk_t *new_node_blk); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_da_root_join(xfs_da_state_t *state, +STATIC int xfs_da3_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk); -STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); -STATIC void xfs_da_node_remove(xfs_da_state_t *state, +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk); -STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *src_node_blk, xfs_da_state_blk_t *dst_node_blk); /* * Utility routines. */ -STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); -STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp); -STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); -STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); -static void -xfs_da_node_verify( + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC || + from->magic == XFS_DA3_NODE_MAGIC); + + if (from->magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static bool +xfs_da3_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_da_node_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); - block_ok = block_ok && - be16_to_cpu(hdr->level) > 0 && - be16_to_cpu(hdr->count) > 0 ; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + + xfs_da3_node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + /* + * we don't know if the node is for and attribute or directory tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_node_ents && + ichdr.count > mp->m_attr_node_ents) + return false; + + /* XXX: hash order check? */ + + return true; } static void -xfs_da_node_write_verify( +xfs_da3_node_write_verify( struct xfs_buf *bp) { - xfs_da_node_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); } /* @@ -124,40 +253,48 @@ xfs_da_node_write_verify( * format of the block being read. */ static void -xfs_da_node_read_verify( +xfs_da3_node_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *info = bp->b_addr; switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DA3_NODE_CRC_OFF)) + break; + /* fall through */ case XFS_DA_NODE_MAGIC: - xfs_da_node_verify(bp); - break; + if (!xfs_da3_node_verify(bp)) + break; + return; case XFS_ATTR_LEAF_MAGIC: - bp->b_ops = &xfs_attr_leaf_buf_ops; + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - bp->b_ops = &xfs_dir2_leafn_buf_ops; + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; bp->b_ops->verify_read(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); break; } + + /* corrupt block */ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); } -const struct xfs_buf_ops xfs_da_node_buf_ops = { - .verify_read = xfs_da_node_read_verify, - .verify_write = xfs_da_node_write_verify, +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, }; - int -xfs_da_node_read( +xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, @@ -165,8 +302,35 @@ xfs_da_node_read( struct xfs_buf **bpp, int which_fork) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, &xfs_da_node_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; } /*======================================================================== @@ -177,33 +341,46 @@ xfs_da_node_read( * Create the initial contents of an intermediate node. */ int -xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork) +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) { - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int error; - xfs_trans_t *tp; + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - tp = args->trans; error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); if (error) return(error); - ASSERT(bp != NULL); + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - node->hdr.info.forw = 0; - node->hdr.info.back = 0; - node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); - node->hdr.info.pad = 0; - node->hdr.count = 0; - node->hdr.level = cpu_to_be16(level); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + + xfs_da3_node_hdr_to_disk(node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); - bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -213,12 +390,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, * intermediate nodes, rebalance, etc. */ int /* error */ -xfs_da_split(xfs_da_state_t *state) +xfs_da3_split( + struct xfs_da_state *state) { - xfs_da_state_blk_t *oldblk, *newblk, *addblk; - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int max, action, error, i; + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action; + int error; + int i; trace_xfs_da_split(state->args); @@ -246,7 +429,7 @@ xfs_da_split(xfs_da_state_t *state) */ switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_split(state, oldblk, newblk); + error = xfs_attr3_leaf_split(state, oldblk, newblk); if ((error != 0) && (error != ENOSPC)) { return(error); /* GROT: attr is inconsistent */ } @@ -261,12 +444,12 @@ xfs_da_split(xfs_da_state_t *state) if (state->inleaf) { state->extraafter = 0; /* before newblk */ trace_xfs_attr_leaf_split_before(state->args); - error = xfs_attr_leaf_split(state, oldblk, + error = xfs_attr3_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ trace_xfs_attr_leaf_split_after(state->args); - error = xfs_attr_leaf_split(state, newblk, + error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } if (error) @@ -280,7 +463,7 @@ xfs_da_split(xfs_da_state_t *state) addblk = newblk; break; case XFS_DA_NODE_MAGIC: - error = xfs_da_node_split(state, oldblk, newblk, addblk, + error = xfs_da3_node_split(state, oldblk, newblk, addblk, max - i, &action); addblk->bp = NULL; if (error) @@ -298,7 +481,7 @@ xfs_da_split(xfs_da_state_t *state) /* * Update the btree to show the new hashval for this child. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } if (!addblk) return(0); @@ -308,7 +491,7 @@ xfs_da_split(xfs_da_state_t *state) */ ASSERT(state->path.active == 0); oldblk = &state->path.blk[0]; - error = xfs_da_root_split(state, oldblk, addblk); + error = xfs_da3_root_split(state, oldblk, addblk); if (error) { addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ @@ -319,8 +502,12 @@ xfs_da_split(xfs_da_state_t *state) * just got bumped because of the addition of a new root node. * There might be three blocks involved if a double split occurred, * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. */ - node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { @@ -359,18 +546,25 @@ xfs_da_split(xfs_da_state_t *state) * the EOF, extending the inode in process. */ STATIC int /* error */ -xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node, *oldroot; - xfs_da_args_t *args; - xfs_dablk_t blkno; - struct xfs_buf *bp; - int error, size; - xfs_inode_t *dp; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dir2_leaf_t *leaf; + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; trace_xfs_da_root_split(state->args); @@ -379,29 +573,65 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * to a free space somewhere. */ args = state->args; - ASSERT(args != NULL); error = xfs_da_grow_inode(args, &blkno); if (error) - return(error); + return error; + dp = args->dp; tp = args->trans; mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); + return error; node = bp->b_addr; oldroot = blk1->bp->b_addr; - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - - (char *)oldroot); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr nodehdr; + + xfs_da3_node_hdr_from_disk(&nodehdr, oldroot); + btree = xfs_da3_node_tree_p(oldroot); + size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); + level = nodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + leaf = (xfs_dir2_leaf_t *)oldroot; - size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - - (char *)leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ memcpy(node, oldroot, size); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } xfs_trans_log_buf(tp, bp, 0, size - 1); bp->b_ops = blk1->bp->b_ops; @@ -411,20 +641,25 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Set up the new root node. */ - error = xfs_da_node_create(args, + error = xfs_da3_node_create(args, (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, - be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + level + 1, &bp, args->whichfork); if (error) - return(error); + return error; + node = bp->b_addr; - node->btree[0].hashval = cpu_to_be32(blk1->hashval); - node->btree[0].before = cpu_to_be32(blk1->blkno); - node->btree[1].hashval = cpu_to_be32(blk2->hashval); - node->btree[1].before = cpu_to_be32(blk2->blkno); - node->hdr.count = cpu_to_be16(2); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + xfs_da3_node_hdr_to_disk(node, &nodehdr); #ifdef DEBUG - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { ASSERT(blk1->blkno >= mp->m_dirleafblk && blk1->blkno < mp->m_dirfreeblk); ASSERT(blk2->blkno >= mp->m_dirleafblk && @@ -434,30 +669,34 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* Header is already logged by xfs_da_node_create */ xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, node->btree, - sizeof(xfs_da_node_entry_t) * 2)); + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); - return(0); + return 0; } /* * Split the node, rebalance, then add the new entry. */ STATIC int /* error */ -xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk, - xfs_da_state_blk_t *addblk, - int treelevel, int *result) +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, + int *result) { - xfs_da_intnode_t *node; - xfs_dablk_t blkno; - int newcount, error; - int useextra; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + xfs_da3_node_hdr_from_disk(&nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -467,7 +706,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, /* * Do we have to split the node? */ - if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + if (nodehdr.count + newcount > state->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. @@ -476,14 +715,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, if (error) return(error); /* GROT: dir is inconsistent */ - error = xfs_da_node_create(state->args, blkno, treelevel, + error = xfs_da3_node_create(state->args, blkno, treelevel, &newblk->bp, state->args->whichfork); if (error) return(error); /* GROT: dir is inconsistent */ newblk->blkno = blkno; newblk->magic = XFS_DA_NODE_MAGIC; - xfs_da_node_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); *result = 1; @@ -495,7 +734,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Insert the new entry(s) into the correct block * (updating last hashval in the process). * - * xfs_da_node_add() inserts BEFORE the given index, + * xfs_da3_node_add() inserts BEFORE the given index, * and as a result of using node_lookup_int() we always * point to a valid entry (not after one), but a split * operation always results in a new block whose hashvals @@ -504,22 +743,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * If we had double-split op below us, then add the extra block too. */ node = oldblk->bp->b_addr; - if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { oldblk->index++; - xfs_da_node_add(state, oldblk, addblk); + xfs_da3_node_add(state, oldblk, addblk); if (useextra) { if (state->extraafter) oldblk->index++; - xfs_da_node_add(state, oldblk, &state->extrablk); + xfs_da3_node_add(state, oldblk, &state->extrablk); state->extravalid = 0; } } else { newblk->index++; - xfs_da_node_add(state, newblk, addblk); + xfs_da3_node_add(state, newblk, addblk); if (useextra) { if (state->extraafter) newblk->index++; - xfs_da_node_add(state, newblk, &state->extrablk); + xfs_da3_node_add(state, newblk, &state->extrablk); state->extravalid = 0; } } @@ -534,33 +774,53 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * NOTE: if blk2 is empty, then it will get the upper half of blk1. */ STATIC void -xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node1, *node2, *tmpnode; - xfs_da_node_entry_t *btree_s, *btree_d; - int count, tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; trace_xfs_da_node_rebalance(state->args); node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. */ - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { tmpnode = node1; node1 = node2; node2 = tmpnode; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + swap = 1; } - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + + count = (nodehdr1.count - nodehdr2.count) / 2; if (count == 0) return; tp = state->args->trans; @@ -571,10 +831,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Move elements in node2 up to make a hole. */ - if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp = nodehdr2.count; + if (tmp > 0) { tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node2->btree[count]; + btree_s = &btree2[0]; + btree_d = &btree2[count]; memmove(btree_d, btree_s, tmp); } @@ -582,12 +843,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Move the req'd B-tree elements from high in node1 to * low in node2. */ - be16_add_cpu(&node2->hdr.count, count); + nodehdr2.count += count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; - btree_d = &node2->btree[0]; + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, -count); + nodehdr1.count -= count; } else { /* * Move the req'd B-tree elements from low in node2 to @@ -595,49 +856,60 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ count = -count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, count); + nodehdr1.count += count; + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* * Move elements in node2 down to fill the hole. */ - tmp = be16_to_cpu(node2->hdr.count) - count; + tmp = nodehdr2.count - count; tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[count]; - btree_d = &node2->btree[0]; + btree_s = &btree2[count]; + btree_d = &btree2[0]; memmove(btree_d, btree_s, tmp); - be16_add_cpu(&node2->hdr.count, -count); + nodehdr2.count -= count; } /* * Log header of node 1 and all current bits of node 2. */ + xfs_da3_node_hdr_to_disk(node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + XFS_DA_LOGRANGE(node1, &node1->hdr, + xfs_da3_node_hdr_size(node1))); + + xfs_da3_node_hdr_to_disk(node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - sizeof(node2->hdr) + - sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + xfs_da3_node_hdr_size(node2) + + (sizeof(btree2[0]) * nodehdr2.count))); /* * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->b_addr; - node2 = blk2->bp->b_addr; - blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); /* * Adjust the expected index for insertion. */ - if (blk1->index >= be16_to_cpu(node1->hdr.count)) { - blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); - blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ } } @@ -645,18 +917,23 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Add a new entry to an intermediate node. */ STATIC void -xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) ASSERT(newblk->blkno >= state->mp->m_dirleafblk && @@ -666,23 +943,25 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * We may need to make some room before we insert the new node. */ tmp = 0; - btree = &node->btree[ oldblk->index ]; - if (oldblk->index < be16_to_cpu(node->hdr.count)) { - tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); - memmove(btree + 1, btree, tmp); + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); } - btree->hashval = cpu_to_be32(newblk->hashval); - btree->before = cpu_to_be32(newblk->blkno); + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); - be16_add_cpu(&node->hdr.count, 1); + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + xfs_da3_node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); /* * Copy the last hash value from the oldblk to propagate upwards. */ - oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); } /*======================================================================== @@ -694,14 +973,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * possibly deallocating that block, etc... */ int -xfs_da_join(xfs_da_state_t *state) +xfs_da3_join( + struct xfs_da_state *state) { - xfs_da_state_blk_t *drop_blk, *save_blk; - int action, error; + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; trace_xfs_da_join(state->args); - action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); @@ -722,12 +1003,12 @@ xfs_da_join(xfs_da_state_t *state) */ switch (drop_blk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_toosmall(state, &action); + error = xfs_attr3_leaf_toosmall(state, &action); if (error) return(error); if (action == 0) return(0); - xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); break; case XFS_DIR2_LEAFN_MAGIC: error = xfs_dir2_leafn_toosmall(state, &action); @@ -742,18 +1023,18 @@ xfs_da_join(xfs_da_state_t *state) * Remove the offending node, fixup hashvals, * check for a toosmall neighbor. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_node_toosmall(state, &action); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); if (error) return(error); if (action == 0) return 0; - xfs_da_node_unbalance(state, drop_blk, save_blk); + xfs_da3_node_unbalance(state, drop_blk, save_blk); break; } - xfs_da_fixhashpath(state, &state->altpath); - error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); xfs_da_state_kill_altpath(state); if (error) return(error); @@ -768,9 +1049,9 @@ xfs_da_join(xfs_da_state_t *state) * we only have one entry in the root, make the child block * the new root. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_root_join(state, &state->path.blk[0]); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); return(error); } @@ -782,9 +1063,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) if (level == 1) { ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else - ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } ASSERT(!blkinfo->forw); ASSERT(!blkinfo->back); } @@ -797,52 +1082,61 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) * the old root to block 0 as the new root node. */ STATIC int -xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) { - xfs_da_intnode_t *oldroot; - xfs_da_args_t *args; - xfs_dablk_t child; - struct xfs_buf *bp; - int error; + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; trace_xfs_da_root_join(state->args); - args = state->args; - ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + + args = state->args; oldroot = root_blk->bp->b_addr; - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(!oldroot->hdr.info.forw); - ASSERT(!oldroot->hdr.info.back); + xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); /* * If the root has more than one child, then don't do anything. */ - if (be16_to_cpu(oldroot->hdr.count) > 1) - return(0); + if (oldroothdr.count > 1) + return 0; /* * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - child = be32_to_cpu(oldroot->btree[0].before); + btree = xfs_da3_node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); ASSERT(child != 0); - error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->b_addr, - be16_to_cpu(oldroot->hdr.level)); + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. Hence we have * to update the b_ops pointer as well to match the buffer type change - * that could occur. + * that could occur. For dir3 blocks we also need to update the block + * number in the buffer header. */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); root_blk->bp->b_ops = bp->b_ops; + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -858,14 +1152,21 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) * If nothing can be done, return 0. */ STATIC int -xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_da_intnode_t *node; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; trace_xfs_da_node_toosmall(state->args); @@ -876,10 +1177,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; - count = be16_to_cpu(node->hdr.count); - if (count > (state->node_ents >> 1)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -890,14 +1190,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (nodehdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ forward = (info->forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -916,35 +1216,34 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ + count = state->node_ents; + count -= state->node_ents >> 2; + count -= nodehdr.count; + /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = nodehdr.forw < nodehdr.back; for (i = 0; i < 2; forward = !forward, i++) { if (forward) - blkno = be32_to_cpu(info->forw); + blkno = nodehdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da_node_read(state->args->trans, state->args->dp, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); - ASSERT(bp != NULL); - node = (xfs_da_intnode_t *)info; - count = state->node_ents; - count -= state->node_ents >> 2; - count -= be16_to_cpu(node->hdr.count); node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count -= be16_to_cpu(node->hdr.count); + xfs_da3_node_hdr_from_disk(&nodehdr, node); xfs_trans_brelse(state->args->trans, bp); - if (count >= 0) + + if (count - nodehdr.count >= 0) break; /* fits with at least 25% to spare */ } if (i >= 2) { *action = 0; - return(0); + return 0; } /* @@ -953,28 +1252,42 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; } *action = 1; - return(0); + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = xfs_da3_node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); } /* @@ -982,13 +1295,16 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * when we stop making changes, return. */ void -xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +xfs_da3_fixhashpath( + struct xfs_da_state *state, + struct xfs_da_state_path *path) { - xfs_da_state_blk_t *blk; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dahash_t lasthash=0; - int level, count; + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + xfs_dahash_t lasthash=0; + int level; + int count; trace_xfs_da_fixhashpath(state->args); @@ -1006,23 +1322,26 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) return; break; case XFS_DA_NODE_MAGIC: - lasthash = xfs_da_node_lasthash(blk->bp, &count); + lasthash = xfs_da3_node_lasthash(blk->bp, &count); if (count == 0) return; break; } for (blk--, level--; level >= 0; blk--, level--) { + struct xfs_da3_icnode_hdr nodehdr; + node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - btree = &node->btree[ blk->index ]; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); if (be32_to_cpu(btree->hashval) == lasthash) break; blk->hashval = lasthash; - btree->hashval = cpu_to_be32(lasthash); + btree[blk->index].hashval = cpu_to_be32(lasthash); xfs_trans_log_buf(state->args->trans, blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + XFS_DA_LOGRANGE(node, &btree[blk->index], + sizeof(*btree))); - lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval); } } @@ -1030,104 +1349,120 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) * Remove an entry from an intermediate node. */ STATIC void -xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); /* * Copy over the offending entry, or just zero it out. */ - btree = &node->btree[drop_blk->index]; - if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { - tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + index = drop_blk->index; + btree = xfs_da3_node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, btree + 1, tmp); + memmove(&btree[index], &btree[index + 1], tmp); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, tmp)); - btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; } - memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); - be16_add_cpu(&node->hdr.count, -1); + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + xfs_da3_node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); /* * Copy the last hash value from the block to propagate upwards. */ - btree--; - drop_blk->hashval = be32_to_cpu(btree->hashval); + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); } /* - * Unbalance the btree elements between two intermediate nodes, + * Unbalance the elements between two intermediate nodes, * move all Btree elements from one node into another. */ STATIC void -xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_intnode_t *drop_node, *save_node; - xfs_da_node_entry_t *btree; - int tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; trace_xfs_da_node_unbalance(state->args); drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node); + xfs_da3_node_hdr_from_disk(&save_hdr, save_node); + drop_btree = xfs_da3_node_tree_p(drop_node); + save_btree = xfs_da3_node_tree_p(save_node); tp = state->args->trans; /* * If the dying block has lower hashvals, then move all the * elements in the remaining block up to make a hole. */ - if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || - (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < - be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) - { - btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; - tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, &save_node->btree[0], tmp); - btree = &save_node->btree[0]; + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? */ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); } else { - btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; + sindex = save_hdr.count; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - be16_to_cpu(drop_node->hdr.count) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); } /* * Move all the B-tree elements from drop_blk to save_blk. */ - tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memcpy(btree, &drop_node->btree[0], tmp); - be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; + xfs_da3_node_hdr_to_disk(save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - sizeof(save_node->hdr))); + xfs_da3_node_hdr_size(save_node))); /* * Save the last hashval in the remaining block for upward propagation. */ - save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); } /*======================================================================== @@ -1146,16 +1481,24 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * pruned depth-first tree search. */ int /* error */ -xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *curr; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dablk_t blkno; - int probe, span, max, error, retval; - xfs_dahash_t hashval, btreehashval; - xfs_da_args_t *args; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; args = state->args; @@ -1171,7 +1514,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; @@ -1180,66 +1523,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); - ASSERT(blk->magic == XFS_DA_NODE_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC || - blk->magic == XFS_ATTR_LEAF_MAGIC); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + /* * Search an intermediate node for a match. */ - if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->b_addr; - max = be16_to_cpu(node->hdr.count); - blk->hashval = be32_to_cpu(node->btree[max-1].hashval); + node = blk->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); - /* - * Binary search. (note: small blocks will skip loop) - */ - probe = span = max / 2; - hashval = args->hashval; - for (btree = &node->btree[probe]; span > 4; - btree = &node->btree[probe]) { - span /= 2; - btreehashval = be32_to_cpu(btree->hashval); - if (btreehashval < hashval) - probe += span; - else if (btreehashval > hashval) - probe -= span; - else - break; - } - ASSERT((probe >= 0) && (probe < max)); - ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); - /* - * Since we may have duplicate hashval's, find the first - * matching hashval in the node. - */ - while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { - btree--; - probe--; - } - while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { - btree++; - probe++; - } + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); - /* - * Pick the right block to descend on. - */ - if (probe == max) { - blk->index = max-1; - blkno = be32_to_cpu(node->btree[max-1].before); - } else { - blk->index = probe; - blkno = be32_to_cpu(btree->before); - } - } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); - break; - } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); - break; + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); } } @@ -1254,7 +1606,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - retval = xfs_attr_leaf_lookup_int(blk->bp, args); + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; } else { @@ -1263,7 +1615,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { - error = xfs_da_path_shift(state, &state->path, 1, 1, + error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); if (error) return(error); @@ -1285,16 +1637,52 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) *========================================================================*/ /* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + xfs_da3_node_hdr_from_disk(&node1hdr, node1); + xfs_da3_node_hdr_from_disk(&node2hdr, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* * Link a new block into a doubly linked list of blocks (of whatever type). */ int /* error */ -xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, - xfs_da_state_blk_t *new_blk) +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) { - xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; - xfs_da_args_t *args; - int before=0, error; - struct xfs_buf *bp; + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; /* * Set up environment. @@ -1306,9 +1694,6 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); - ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); - ASSERT(old_blk->magic == new_blk->magic); switch (old_blk->magic) { case XFS_ATTR_LEAF_MAGIC: @@ -1318,7 +1703,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); break; case XFS_DA_NODE_MAGIC: - before = xfs_da_node_order(old_blk->bp, new_blk->bp); + before = xfs_da3_node_order(old_blk->bp, new_blk->bp); break; } @@ -1333,14 +1718,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); tmp_info = bp->b_addr; - ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); @@ -1354,7 +1739,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) @@ -1375,59 +1760,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, } /* - * Compare two intermediate nodes for "order". - */ -STATIC int -xfs_da_node_order( - struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp) -{ - xfs_da_intnode_t *node1, *node2; - - node1 = node1_bp->b_addr; - node2 = node2_bp->b_addr; - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && - node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < - be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); -} - -/* - * Pick up the last hashvalue from an intermediate node. - */ -STATIC uint -xfs_da_node_lasthash( - struct xfs_buf *bp, - int *count) -{ - xfs_da_intnode_t *node; - - node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (count) - *count = be16_to_cpu(node->hdr.count); - if (!node->hdr.count) - return(0); - return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); -} - -/* * Unlink a block from a doubly linked list of blocks. */ STATIC int /* error */ -xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; - xfs_da_args_t *args; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; /* * Set up environment. @@ -1439,8 +1785,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); - ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); ASSERT(save_blk->magic == drop_blk->magic); ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || (be32_to_cpu(save_info->back) == drop_blk->blkno)); @@ -1454,7 +1798,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) @@ -1471,7 +1815,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) @@ -1499,15 +1843,22 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * the new bottom and the root. */ int /* error */ -xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, - int forward, int release, int *result) +xfs_da3_path_shift( + struct xfs_da_state *state, + struct xfs_da_state_path *path, + int forward, + int release, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_da_args_t *args; - xfs_dablk_t blkno=0; - int level, error; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + struct xfs_da_intnode *node; + struct xfs_da_args *args; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno = 0; + int level; + int error; trace_xfs_da_path_shift(state->args); @@ -1522,16 +1873,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { - ASSERT(blk->bp != NULL); node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + + if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } } @@ -1557,45 +1909,60 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) return(error); - ASSERT(blk->bp != NULL); info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - blk->magic = be16_to_cpu(info->magic); - if (blk->magic == XFS_DA_NODE_MAGIC) { + info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + + /* + * Note: we flatten the magic number to a single type so we + * don't have to compare against crc/non-crc types elsewhere. + */ + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + blk->magic = XFS_DA_NODE_MAGIC; node = (xfs_da_intnode_t *)info; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; else - blk->index = be16_to_cpu(node->hdr.count)-1; - blkno = be32_to_cpu(node->btree[blk->index].before); - } else { + blk->index = nodehdr.count - 1; + blkno = be32_to_cpu(btree[blk->index].before); + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - switch(blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, - NULL); - break; - case XFS_DIR2_LEAFN_MAGIC: - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, - NULL); - break; - default: - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC); - break; - } + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, + NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + blk->magic = XFS_DIR2_LEAFN_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, + NULL); + break; + default: + ASSERT(0); + break; } } *result = 0; - return(0); + return 0; } @@ -1782,22 +2149,36 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock( - xfs_da_args_t *args, - xfs_dablk_t *dead_blknop, - struct xfs_buf **dead_bufp) +xfs_da3_swap_lastblock( + struct xfs_da_args *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { - xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; - xfs_fileoff_t lastoff; - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error, w, entno, level, dead_level; - xfs_da_blkinfo_t *dead_info, *sib_info; - xfs_da_intnode_t *par_node, *dead_node; - xfs_dir2_leaf_t *dead_leaf2; - xfs_dahash_t dead_hash; + struct xfs_da_blkinfo *dead_info; + struct xfs_da_blkinfo *sib_info; + struct xfs_da_intnode *par_node; + struct xfs_da_intnode *dead_node; + struct xfs_dir2_leaf *dead_leaf2; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr par_hdr; + struct xfs_inode *ip; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_buf *dead_buf; + struct xfs_buf *last_buf; + struct xfs_buf *sib_buf; + struct xfs_buf *par_buf; + xfs_dahash_t dead_hash; + xfs_fileoff_t lastoff; + xfs_dablk_t dead_blkno; + xfs_dablk_t last_blkno; + xfs_dablk_t sib_blkno; + xfs_dablk_t par_blkno; + int error; + int w; + int entno; + int level; + int dead_level; trace_xfs_da_swap_lastblock(args); @@ -1821,7 +2202,7 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); + error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w); if (error) return error; /* @@ -1833,22 +2214,31 @@ xfs_da_swap_lastblock( /* * Get values from the moved block. */ - if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = xfs_dir3_leaf_ents_p(dead_leaf2); dead_level = 0; - dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { - ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + struct xfs_da3_icnode_hdr deadhdr; + dead_node = (xfs_da_intnode_t *)dead_info; - dead_level = be16_to_cpu(dead_node->hdr.level); - dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + xfs_da3_node_hdr_from_disk(&deadhdr, dead_node); + btree = xfs_da3_node_tree_p(dead_node); + dead_level = deadhdr.level; + dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } sib_buf = par_buf = NULL; /* * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1870,7 +2260,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1894,31 +2284,31 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely(par_node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC) || - (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - level = be16_to_cpu(par_node->hdr.level); + level = par_hdr.level; + btree = xfs_da3_node_tree_p(par_node); for (entno = 0; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno < par_hdr.count && + be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + if (entno == par_hdr.count) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - par_blkno = be32_to_cpu(par_node->btree[entno].before); + par_blkno = be32_to_cpu(btree[entno].before); if (level == dead_level + 1) break; xfs_trans_brelse(tp, par_buf); @@ -1930,13 +2320,13 @@ xfs_da_swap_lastblock( */ for (;;) { for (; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno < par_hdr.count && + be32_to_cpu(btree[entno].before) != last_blkno; entno++) continue; - if (entno < be16_to_cpu(par_node->hdr.count)) + if (entno < par_hdr.count) break; - par_blkno = be32_to_cpu(par_node->hdr.info.forw); + par_blkno = par_hdr.forw; xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { @@ -1945,27 +2335,27 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely( - be16_to_cpu(par_node->hdr.level) != level || - par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { + xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } + btree = xfs_da3_node_tree_p(par_node); entno = 0; } /* * Update the parent entry pointing to the moved block. */ - par_node->btree[entno].before = cpu_to_be32(dead_blkno); + btree[entno].before = cpu_to_be32(dead_blkno); xfs_trans_log_buf(tp, par_buf, - XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, - sizeof(par_node->btree[entno].before))); + XFS_DA_LOGRANGE(par_node, &btree[entno].before, + sizeof(btree[entno].before))); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; @@ -2007,14 +2397,15 @@ xfs_da_shrink_inode( * Remove extents. If we get ENOSPC for a dir we have to move * the last block to the place we want to kill. */ - if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, - &done)) == ENOSPC) { + error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, &done); + if (error == ENOSPC) { if (w != XFS_DATA_FORK) break; - if ((error = xfs_da_swap_lastblock(args, &dead_blkno, - &dead_buf))) + error = xfs_da3_swap_lastblock(args, &dead_blkno, + &dead_buf); + if (error) break; } else { break; @@ -2074,7 +2465,8 @@ xfs_buf_map_from_irec( ASSERT(nirecs >= 1); if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); if (!map) return ENOMEM; *mapp = map; @@ -2130,7 +2522,8 @@ xfs_dabuf_map( * Optimize the one-block case. */ if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, @@ -2279,12 +2672,21 @@ xfs_da_read_buf( magic1 = be32_to_cpu(hdr->magic); if (unlikely( XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && + (magic != XFS_DA3_NODE_MAGIC) && (magic != XFS_ATTR_LEAF_MAGIC) && + (magic != XFS_ATTR3_LEAF_MAGIC) && (magic != XFS_DIR2_LEAF1_MAGIC) && + (magic != XFS_DIR3_LEAF1_MAGIC) && (magic != XFS_DIR2_LEAFN_MAGIC) && + (magic != XFS_DIR3_LEAFN_MAGIC) && (magic1 != XFS_DIR2_BLOCK_MAGIC) && + (magic1 != XFS_DIR3_BLOCK_MAGIC) && (magic1 != XFS_DIR2_DATA_MAGIC) && - (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), + (magic1 != XFS_DIR3_DATA_MAGIC) && + (free->hdr.magic != + cpu_to_be32(XFS_DIR2_FREE_MAGIC)) && + (free->hdr.magic != + cpu_to_be32(XFS_DIR3_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { trace_xfs_da_btree_corrupt(bp, _RET_IP_); @@ -2342,41 +2744,3 @@ out_free: return -1; return mappedbno; } - -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ - -/* - * Allocate a dir-state structure. - * We don't put them on the stack since they're large. - */ -xfs_da_state_t * -xfs_da_state_alloc(void) -{ - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); -} - -/* - * Kill the altpath contents of a da-state structure. - */ -STATIC void -xfs_da_state_kill_altpath(xfs_da_state_t *state) -{ - int i; - - for (i = 0; i < state->altpath.active; i++) - state->altpath.blk[i].bp = NULL; - state->altpath.active = 0; -} - -/* - * Free a da-state structure. - */ -void -xfs_da_state_free(xfs_da_state_t *state) -{ - xfs_da_state_kill_altpath(state); -#ifdef DEBUG - memset((char *)state, 0, sizeof(*state)); -#endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index ee5170c46ae1..6fb3371c63cf 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -20,7 +21,6 @@ struct xfs_bmap_free; struct xfs_inode; -struct xfs_mount; struct xfs_trans; struct zone; @@ -47,6 +47,33 @@ typedef struct xfs_da_blkinfo { } xfs_da_blkinfo_t; /* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* * This is the structure of the root and intermediate nodes in the Btree. * The leaf nodes are defined above. * @@ -57,19 +84,76 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + typedef struct xfs_da_intnode { - struct xfs_da_node_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active entries */ - __be16 level; /* level above leaves (leaf == 0) */ - } hdr; - struct xfs_da_node_entry { - __be32 hashval; /* hash value for this descendant */ - __be32 before; /* Btree block before this key */ - } btree[1]; /* variable sized array of keys */ + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; } xfs_da_intnode_t; -typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; -typedef struct xfs_da_node_entry xfs_da_node_entry_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); +extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + +static inline int +xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) +{ + if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return sizeof(struct xfs_da3_node_hdr); + return sizeof(struct xfs_da_node_hdr); +} + +static inline struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap; + return dap3->__btree; + } + return dap->__btree; +} + +extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); +extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); #define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize @@ -191,32 +275,34 @@ struct xfs_nameops { /* * Routines used for growing the Btree. */ -int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork); -int xfs_da_split(xfs_da_state_t *state); +int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno, + int level, struct xfs_buf **bpp, int whichfork); +int xfs_da3_split(xfs_da_state_t *state); /* * Routines used for shrinking the Btree. */ -int xfs_da_join(xfs_da_state_t *state); -void xfs_da_fixhashpath(xfs_da_state_t *state, - xfs_da_state_path_t *path_to_to_fix); +int xfs_da3_join(xfs_da_state_t *state); +void xfs_da3_fixhashpath(struct xfs_da_state *state, + struct xfs_da_state_path *path_to_to_fix); /* * Routines used for finding things in the Btree. */ -int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); -int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, +int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int forward, int release, int *result); /* * Utility routines. */ -int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, +int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); -int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int which_fork); +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; + /* * Utility routines. */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f852b082a084..e36445ceaf80 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -24,6 +24,9 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -182,7 +185,7 @@ xfs_swap_extents_check_format( */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) return EINVAL; if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) @@ -192,9 +195,8 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return EINVAL; - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return EINVAL; @@ -219,6 +221,14 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; + /* + * We have no way of updating owner information in the BMBT blocks for + * each inode on CRC enabled filesystems, so to avoid corrupting the + * this metadata we simply don't allow extent swaps to occur. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return XFS_ERROR(EINVAL); + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 1d9643b3dce6..f7a0e95d197a 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -19,7 +19,7 @@ #define __XFS_DINODE_H__ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_timestamp { __be32 t_sec; /* timestamp seconds */ @@ -70,11 +70,36 @@ typedef struct xfs_dinode { /* di_next_unlinked is the only non-core field in the old dinode */ __be32 di_next_unlinked;/* agi unlinked list ptr */ -} __attribute__((packed)) xfs_dinode_t; + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; #define DI_MAX_FLUSH 0xffff /* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. @@ -104,11 +129,11 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp) \ - ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) -#define XFS_BROOT_SIZE_ADJ \ - (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) +#define XFS_BROOT_SIZE_ADJ(ip) \ + (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t)) /* * Inode data & attribute fork sizes, per inode. @@ -119,10 +144,10 @@ typedef enum xfs_dinode_fmt { #define XFS_DFORK_DSIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp)) + XFS_LITINO(mp, (dip)->di_version)) #define XFS_DFORK_ASIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ @@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)(dip) + sizeof(struct xfs_dinode)) + ((char *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index b26a50f9921d..8f023dee404d 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -368,10 +368,8 @@ xfs_dir_removename( int xfs_readdir( xfs_inode_t *dp, - void *dirent, - size_t bufsize, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx, + size_t bufsize) { int rval; /* return value */ int v; /* type-checking value */ @@ -385,14 +383,13 @@ xfs_readdir( XFS_STATS_INC(xs_dir_getdents); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); + rval = xfs_dir2_sf_getdents(dp, ctx); else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) ; else if (v) - rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); + rval = xfs_dir2_block_getdents(dp, ctx); else - rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, - filldir); + rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); return rval; } diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 12afe07a91d7..09aea0247d96 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -28,11 +29,13 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Local function prototypes. @@ -56,52 +59,110 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } -static void -xfs_dir2_block_verify( +static bool +xfs_dir3_block_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } static void -xfs_dir2_block_read_verify( +xfs_dir3_block_read_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_DATA_CRC_OFF)) || + !xfs_dir3_block_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_block_write_verify( +xfs_dir3_block_write_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_block_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_block_buf_ops = { - .verify_read = xfs_dir2_block_read_verify, - .verify_write = xfs_dir2_block_write_verify, +const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .verify_read = xfs_dir3_block_read_verify, + .verify_write = xfs_dir3_block_write_verify, }; static int -xfs_dir2_block_read( +xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + int err; - return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, &xfs_dir2_block_buf_ops); + err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; +} + +static void +xfs_dir3_block_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + bp->b_ops = &xfs_dir3_block_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + return; + + } + hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); } static void @@ -121,7 +182,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = hdr->bestfree; + bf = xfs_dir3_data_bestfree_p(hdr); /* * If there are stale entries we'll use one for the leaf. @@ -303,7 +364,7 @@ xfs_dir2_block_addname( mp = dp->i_mount; /* Read the (one and only) directory block into bp. */ - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; @@ -498,7 +559,7 @@ xfs_dir2_block_addname( xfs_dir2_data_log_header(tp, bp); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(tp, bp, dep); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } @@ -508,9 +569,7 @@ xfs_dir2_block_addname( int /* error */ xfs_dir2_block_getdents( xfs_inode_t *dp, /* incore inode */ - void *dirent, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx) { xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ @@ -528,10 +587,10 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) return 0; - error = xfs_dir2_block_read(NULL, dp, &bp); + error = xfs_dir3_block_read(NULL, dp, &bp); if (error) return error; @@ -539,14 +598,14 @@ xfs_dir2_block_getdents( * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ - wantoff = xfs_dir2_dataptr_to_off(mp, *offset); + wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * Set up values for the loop. */ btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); /* @@ -578,13 +637,12 @@ xfs_dir2_block_getdents( cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (char *)dep - (char *)hdr); + ctx->pos = cook & 0x7fffffff; /* * If it didn't fit, set the final offset to here & return. */ - if (filldir(dirent, (char *)dep->name, dep->namelen, - cook & 0x7fffffff, be64_to_cpu(dep->inumber), - DT_UNKNOWN)) { - *offset = cook & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), DT_UNKNOWN)) { xfs_trans_brelse(NULL, bp); return 0; } @@ -594,7 +652,7 @@ xfs_dir2_block_getdents( * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ - *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; xfs_trans_brelse(NULL, bp); return 0; @@ -665,7 +723,7 @@ xfs_dir2_block_lookup( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -711,12 +769,12 @@ xfs_dir2_block_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -853,7 +911,7 @@ xfs_dir2_block_removename( xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ @@ -910,7 +968,7 @@ xfs_dir2_block_replace( */ dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_log_entry(args->trans, bp, dep); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } @@ -958,6 +1016,8 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); @@ -965,8 +1025,12 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); ltp = xfs_dir2_leaf_tail_p(mp, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have @@ -974,9 +1038,12 @@ xfs_dir2_leaf_to_block( * These will show up in the leaf bests table. */ while (dp->i_d.di_size > mp->m_dirblksize) { + int hdrsz; + + hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + mp->m_dirblksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -988,17 +1055,19 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); if (error) return error; } hdr = dbp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + /* * Size of the "leaf" area in the block. */ size = (uint)sizeof(xfs_dir2_block_tail_t) + - (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); /* * Look at the last data entry. */ @@ -1014,8 +1083,8 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - dbp->b_ops = &xfs_dir2_block_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + xfs_dir3_block_init(mp, tp, dbp, dp); + needlog = 1; needscan = 0; /* @@ -1027,18 +1096,17 @@ xfs_dir2_leaf_to_block( * Initialize the block tail. */ btp = xfs_dir2_block_tail_p(mp, hdr); - btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ lep = xfs_dir2_block_leaf_p(btp); - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + for (from = to = 0; from < leafhdr.count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = leaf->ents[from]; + lep[to++] = ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -1137,16 +1205,16 @@ xfs_dir2_sf_to_block( return error; } /* - * Initialize the data block. + * Initialize the data block, then convert it to block format. */ - error = xfs_dir2_data_init(args, blkno, &bp); + error = xfs_dir3_data_init(args, blkno, &bp); if (error) { kmem_free(sfp); return error; } - bp->b_ops = &xfs_dir2_block_buf_ops; + xfs_dir3_block_init(mp, tp, bp, dp); hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + /* * Compute size of block "tail" area. */ @@ -1156,7 +1224,7 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = xfs_dir3_data_unused_p(hdr); needlog = needscan = 0; xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, &needscan); @@ -1178,8 +1246,7 @@ xfs_dir2_sf_to_block( /* * Create entry for . */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET); + dep = xfs_dir3_data_dot_entry_p(hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; @@ -1192,8 +1259,7 @@ xfs_dir2_sf_to_block( /* * Create entry for .. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET); + dep = xfs_dir3_data_dotdot_entry_p(hdr); dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; @@ -1203,7 +1269,7 @@ xfs_dir2_sf_to_block( blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)hdr)); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = xfs_dir3_data_first_offset(hdr); /* * Loop over existing entries, stuff them in. */ @@ -1273,6 +1339,6 @@ xfs_dir2_sf_to_block( ASSERT(needscan == 0); xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index ffcf1774152e..c2930238005c 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -30,6 +31,8 @@ #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); @@ -40,7 +43,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); * Return 0 is the buffer is good, otherwise an error. */ int -__xfs_dir2_data_check( +__xfs_dir3_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -65,15 +68,17 @@ __xfs_dir2_data_check( mp = bp->b_target->bt_mount; hdr = bp->b_addr; - bf = hdr->bestfree; - p = (char *)(hdr + 1); + bf = xfs_dir3_data_bestfree_p(hdr); + p = (char *)xfs_dir3_data_entry_p(hdr); switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; break; + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; break; @@ -148,7 +153,8 @@ __xfs_dir2_data_check( (char *)dep - (char *)hdr); count++; lastfree = 0; - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)hdr)); @@ -168,7 +174,8 @@ __xfs_dir2_data_check( * Need to have seen all the entries and all the bestfree slots. */ XFS_WANT_CORRUPTED_RETURN(freeseen == 7); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) @@ -185,21 +192,27 @@ __xfs_dir2_data_check( return 0; } -static void -xfs_dir2_data_verify( +static bool +xfs_dir3_data_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } /* @@ -208,7 +221,7 @@ xfs_dir2_data_verify( * format buffer or a data format buffer on readahead. */ static void -xfs_dir2_data_reada_verify( +xfs_dir3_data_reada_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -216,11 +229,13 @@ xfs_dir2_data_reada_verify( switch (hdr->magic) { case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - bp->b_ops = &xfs_dir2_block_buf_ops; + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; bp->b_ops->verify_read(bp); return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - xfs_dir2_data_verify(bp); + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); return; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); @@ -230,51 +245,80 @@ xfs_dir2_data_reada_verify( } static void -xfs_dir2_data_read_verify( +xfs_dir3_data_read_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_DATA_CRC_OFF)) || + !xfs_dir3_data_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_data_write_verify( +xfs_dir3_data_write_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_data_buf_ops = { - .verify_read = xfs_dir2_data_read_verify, - .verify_write = xfs_dir2_data_write_verify, +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = xfs_dir3_data_write_verify, }; -static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { - .verify_read = xfs_dir2_data_reada_verify, - .verify_write = xfs_dir2_data_write_verify, +static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, }; int -xfs_dir2_data_read( +xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, &xfs_dir2_data_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; } int -xfs_dir2_data_readahead( +xfs_dir3_data_readahead( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } /* @@ -288,12 +332,15 @@ xfs_dir2_data_freefind( { xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ + struct xfs_dir2_data_free *bf; #if defined(DEBUG) && defined(__KERNEL__) int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + bf = xfs_dir3_data_bestfree_p(hdr); + #if defined(DEBUG) && defined(__KERNEL__) /* * Validate some consistency in the bestfree table. @@ -301,9 +348,11 @@ xfs_dir2_data_freefind( * one we're looking for it has to be exact. */ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - for (dfp = &hdr->bestfree[0], seenzero = matched = 0; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { ASSERT(!dfp->length); @@ -319,7 +368,7 @@ xfs_dir2_data_freefind( else ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &hdr->bestfree[0]) + if (dfp > &bf[0]) ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); } #endif @@ -328,14 +377,12 @@ xfs_dir2_data_freefind( * it can't be there since they're sorted. */ if (be16_to_cpu(dup->length) < - be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) return NULL; /* * Look at the three bestfree entries for our guy. */ - for (dfp = &hdr->bestfree[0]; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; - dfp++) { + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) return NULL; if (be16_to_cpu(dfp->offset) == off) @@ -359,11 +406,12 @@ xfs_dir2_data_freeinsert( xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ xfs_dir2_data_free_t new; /* new bestfree entry */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif - dfp = hdr->bestfree; + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + dfp = xfs_dir3_data_bestfree_p(hdr); new.length = dup->length; new.offset = cpu_to_be16((char *)dup - (char *)hdr); @@ -400,32 +448,36 @@ xfs_dir2_data_freeremove( xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { -#ifdef __KERNEL__ + struct xfs_dir2_data_free *bf; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * It's the first entry, slide the next 2 up. */ - if (dfp == &hdr->bestfree[0]) { - hdr->bestfree[0] = hdr->bestfree[1]; - hdr->bestfree[1] = hdr->bestfree[2]; + bf = xfs_dir3_data_bestfree_p(hdr); + if (dfp == &bf[0]) { + bf[0] = bf[1]; + bf[1] = bf[2]; } /* * It's the second entry, slide the 3rd entry up. */ - else if (dfp == &hdr->bestfree[1]) - hdr->bestfree[1] = hdr->bestfree[2]; + else if (dfp == &bf[1]) + bf[1] = bf[2]; /* * Must be the last entry. */ else - ASSERT(dfp == &hdr->bestfree[2]); + ASSERT(dfp == &bf[2]); /* * Clear the 3rd entry, must be zero now. */ - hdr->bestfree[2].length = 0; - hdr->bestfree[2].offset = 0; + bf[2].length = 0; + bf[2].offset = 0; *loghead = 1; } @@ -441,23 +493,27 @@ xfs_dir2_data_freescan( xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ + struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * Start by clearing the table. */ - memset(hdr->bestfree, 0, sizeof(hdr->bestfree)); + bf = xfs_dir3_data_bestfree_p(hdr); + memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + p = (char *)xfs_dir3_data_entry_p(hdr); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(mp, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else @@ -493,7 +549,7 @@ xfs_dir2_data_freescan( * Give back the buffer for the created block. */ int /* error */ -xfs_dir2_data_init( +xfs_dir3_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ struct xfs_buf **bpp) /* output block buffer */ @@ -502,6 +558,7 @@ xfs_dir2_data_init( xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + struct xfs_dir2_data_free *bf; int error; /* error return value */ int i; /* bestfree index */ xfs_mount_t *mp; /* filesystem mount point */ @@ -518,27 +575,40 @@ xfs_dir2_data_init( XFS_DATA_FORK); if (error) return error; - bp->b_ops = &xfs_dir2_data_buf_ops; + bp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* * Initialize the header. */ hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + } else + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + + bf = xfs_dir3_data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - hdr->bestfree[i].length = 0; - hdr->bestfree[i].offset = 0; + bf[i].length = 0; + bf[i].offset = 0; } /* * Set up an unused entry for the block's body. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = xfs_dir3_data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t = mp->m_dirblksize - (uint)sizeof(*hdr); - hdr->bestfree[0].length = cpu_to_be16(t); + t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr); + bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* @@ -562,7 +632,9 @@ xfs_dir2_data_log_entry( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - @@ -580,9 +652,11 @@ xfs_dir2_data_log_header( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1); } /* @@ -597,7 +671,9 @@ xfs_dir2_data_log_unused( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); /* * Log the first part of the unused entry. @@ -635,6 +711,7 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + struct xfs_dir2_data_free *bf; mp = tp->t_mountp; hdr = bp->b_addr; @@ -642,12 +719,14 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) endptr = (char *)hdr + mp->m_dirblksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); btp = xfs_dir2_block_tail_p(mp, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } @@ -655,7 +734,7 @@ xfs_dir2_data_make_free( * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > sizeof(*hdr)) { + if (offset > xfs_dir3_data_entry_offset(hdr)) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -681,6 +760,7 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ + bf = xfs_dir3_data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ @@ -695,7 +775,7 @@ xfs_dir2_data_make_free( * since the third bestfree is there, there might be more * entries. */ - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); /* * Fix up the new big freespace. */ @@ -711,10 +791,10 @@ xfs_dir2_data_make_free( * Remove entry 1 first then entry 0. */ ASSERT(dfp && dfp2); - if (dfp == &hdr->bestfree[1]) { - dfp = &hdr->bestfree[0]; + if (dfp == &bf[1]) { + dfp = &bf[0]; ASSERT(dfp2 == dfp); - dfp2 = &hdr->bestfree[1]; + dfp2 = &bf[1]; } xfs_dir2_data_freeremove(hdr, dfp2, needlogp); xfs_dir2_data_freeremove(hdr, dfp, needlogp); @@ -722,7 +802,7 @@ xfs_dir2_data_make_free( * Now insert the new entry. */ dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); - ASSERT(dfp == &hdr->bestfree[0]); + ASSERT(dfp == &bf[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); ASSERT(!dfp[2].length); @@ -751,7 +831,7 @@ xfs_dir2_data_make_free( */ else { needscan = be16_to_cpu(prevdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -779,7 +859,7 @@ xfs_dir2_data_make_free( */ else { needscan = be16_to_cpu(newdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -818,10 +898,13 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ + struct xfs_dir2_data_free *bf; hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); ASSERT(offset >= (char *)dup - (char *)hdr); ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); @@ -831,7 +914,8 @@ xfs_dir2_data_use_free( */ dfp = xfs_dir2_data_freefind(hdr, dup); oldlen = be16_to_cpu(dup->length); - ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length)); + bf = xfs_dir3_data_bestfree_p(hdr); + ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* * Check for alignment with front and back of the entry. */ @@ -845,7 +929,7 @@ xfs_dir2_data_use_free( */ if (matchfront && matchback) { if (dfp) { - needscan = (hdr->bestfree[2].offset != 0); + needscan = (bf[2].offset != 0); if (!needscan) xfs_dir2_data_freeremove(hdr, dfp, needlogp); } @@ -875,7 +959,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -902,7 +986,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -930,7 +1014,7 @@ xfs_dir2_data_use_free( * the 2 new will work. */ if (dfp) { - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); if (!needscan) { xfs_dir2_data_freeremove(hdr, dfp, needlogp); xfs_dir2_data_freeinsert(hdr, newdup, needlogp); diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index 07270981f48f..7826782b8d78 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -36,6 +37,38 @@ #define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ /* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or the format is v3 + * they implement the v3 functionality. This means the existing dir2 is a mix of + * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible as doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* * Byte offset in data block and shortform entry. */ typedef __uint16_t xfs_dir2_data_off_t; @@ -195,16 +228,6 @@ xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* - * Offsets of . and .. in data space (always block 0) - */ -#define XFS_DIR2_DATA_DOT_OFFSET \ - ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr)) -#define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) -#define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) - -/* * Describe a free area in the data block. * * The freespace will be formatted as a xfs_dir2_data_unused_t. @@ -226,6 +249,40 @@ typedef struct xfs_dir2_data_hdr { } xfs_dir2_data_hdr_t; /* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. + */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +static inline struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr; + return hdr3->best_free; + } + return hdr->bestfree; +} + +/* * Active entry in a data block. * * Aligned to 8 bytes. After the variable length name field there is a @@ -280,6 +337,94 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) be16_to_cpu(dup->length) - sizeof(__be16)); } +static inline size_t +xfs_dir3_data_hdr_size(bool dir3) +{ + if (dir3) + return sizeof(struct xfs_dir3_data_hdr); + return sizeof(struct xfs_dir2_data_hdr); +} + +static inline size_t +xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr) +{ + bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return xfs_dir3_data_hdr_size(dir3); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); +} + +static inline struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); +} + +/* + * Offsets of . and .. in data space (always block 0) + * + * The macros are used for shortform directories as they have no headers to read + * the magic number out of. Shortform directories need to know the size of the + * data block header because the sfe embeds the block offset of the entry into + * it so that it doesn't change when format conversion occurs. Bad Things Happen + * if we don't follow this rule. + */ +#define XFS_DIR3_DATA_DOT_OFFSET(mp) \ + xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) +#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ + (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1)) +#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ + (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2)) + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_entry_offset(hdr); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2); +} + +/* + * location of . and .. in data space (always block 0) + */ +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_dot_offset(hdr)); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr)); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_first_offset(hdr)); +} + /* * Leaf block structures. * @@ -329,6 +474,21 @@ typedef struct xfs_dir2_leaf_hdr { __be16 stale; /* count of stale entries */ } xfs_dir2_leaf_hdr_t; +struct xfs_dir3_leaf_hdr { + struct xfs_da3_blkinfo info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t stale; +}; + /* * Leaf block entry. */ @@ -348,23 +508,50 @@ typedef struct xfs_dir2_leaf_tail { * Leaf block. */ typedef struct xfs_dir2_leaf { - xfs_dir2_leaf_hdr_t hdr; /* leaf header */ - xfs_dir2_leaf_entry_t ents[]; /* entries */ + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ } xfs_dir2_leaf_t; -/* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; -static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +static inline int +xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) { - return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) / + if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return sizeof(struct xfs_dir3_leaf_hdr); + return sizeof(struct xfs_dir2_leaf_hdr); +} + +static inline int +xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) +{ + return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) / (uint)sizeof(struct xfs_dir2_leaf_entry); } /* * Get address of the bestcount field in the single-leaf block. */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp; + return lp3->__ents; + } + return lp->__ents; +} + +/* + * Get address of the bestcount field in the single-leaf block. + */ static inline struct xfs_dir2_leaf_tail * xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) { @@ -383,6 +570,10 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* * Convert dataptr to byte in file space */ static inline xfs_dir2_off_t @@ -520,19 +711,66 @@ typedef struct xfs_dir2_free { /* unused entries are -1 */ } xfs_dir2_free_t; -static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp) +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. + */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + +static inline int +xfs_dir3_free_hdr_size(struct xfs_mount *mp) { - return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) / + if (xfs_sb_version_hascrc(&mp->m_sb)) + return sizeof(struct xfs_dir3_free_hdr); + return sizeof(struct xfs_dir2_free_hdr); +} + +static inline int +xfs_dir3_free_max_bests(struct xfs_mount *mp) +{ + return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) / sizeof(xfs_dir2_data_off_t); } +static inline __be16 * +xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp)); +} + /* * Convert data space db to the corresponding free db. */ static inline xfs_dir2_db_t xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) { - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp); + return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp); } /* @@ -541,7 +779,7 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) static inline int xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) { - return db % xfs_dir2_free_max_bests(mp); + return db % xfs_dir3_free_max_bests(mp); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 60cd2fa4e047..2aed25cae04d 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -33,97 +34,371 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Local function declarations. */ -#ifdef DEBUG -static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leaf_check(dp, bp) -#endif static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, int *indexp, struct xfs_buf **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, +static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); -static void -xfs_dir2_leaf_verify( +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(mp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((mp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(mp, bp) +#endif + +void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + } else { + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + } + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + if (from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC) { + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); + } else { + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); + } +} + +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + + ents = xfs_dir3_leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf)) + return false; + + /* Leaves and bests don't overlap in leaf format. */ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. */ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +static bool +xfs_dir3_leaf_verify( struct xfs_buf *bp, - __be16 magic) + __uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + if ((magic == XFS_DIR2_LEAF1_MAGIC && + leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) || + (magic == XFS_DIR2_LEAFN_MAGIC && + leafhdr.magic != XFS_DIR3_LEAFN_MAGIC)) + return false; - block_ok = hdr->info.magic == magic; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leafhdr.magic != magic) + return false; + } + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_LEAF_CRC_OFF)) || + !xfs_dir3_leaf_verify(bp, magic)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_leaf_verify(bp, magic)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + return; } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir3_leaf1_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); } static void -xfs_dir2_leaf1_write_verify( +xfs_dir3_leaf1_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); } -void -xfs_dir2_leafn_read_verify( +static void +xfs_dir3_leafn_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -void -xfs_dir2_leafn_write_verify( +static void +xfs_dir3_leafn_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { - .verify_read = xfs_dir2_leaf1_read_verify, - .verify_write = xfs_dir2_leaf1_write_verify, +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, }; -const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { - .verify_read = xfs_dir2_leafn_read_verify, - .verify_write = xfs_dir2_leafn_write_verify, +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, }; static int -xfs_dir2_leaf_read( +xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; } int -xfs_dir2_leafn_read( +xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +static void +xfs_dir3_leaf_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + xfs_ino_t owner, + __uint16_t type) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + memset(leaf3, 0, sizeof(*leaf3)); + + leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) + ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) + : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.owner = cpu_to_be64(owner); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + } else { + memset(leaf, 0, sizeof(*leaf)); + leaf->hdr.info.magic = cpu_to_be16(type); + } + + /* + * If it's a leaf-format directory initialize the tail. + * Caller is responsible for initialising the bests table. + */ + if (type == XFS_DIR2_LEAF1_MAGIC) { + struct xfs_dir2_leaf_tail *ltp; + + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = 0; + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); + } else { + bp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } +} + +int +xfs_dir3_leaf_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t bno, + struct xfs_buf **bpp, + __uint16_t magic) +{ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && + bno < XFS_DIR2_FREE_FIRSTDB(mp)); + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, + XFS_DATA_FORK); + if (error) + return error; + + xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_log_header(tp, bp); + if (magic == XFS_DIR2_LEAF1_MAGIC) + xfs_dir3_leaf_log_tail(tp, bp); + *bpp = bp; + return 0; } /* @@ -149,6 +424,9 @@ xfs_dir2_block_to_leaf( int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_block_to_leaf(args); @@ -168,26 +446,33 @@ xfs_dir2_block_to_leaf( /* * Initialize the leaf block, get a buffer for it. */ - if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC); + if (error) return error; - } - ASSERT(lbp != NULL); + leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + bf = xfs_dir3_data_bestfree_p(hdr); + ents = xfs_dir3_leaf_ents_p(leaf); + /* * Set the counts in the leaf header. */ - leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); - leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + leafhdr.count = be32_to_cpu(btp->count); + leafhdr.stale = be32_to_cpu(btp->stale); + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ - memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* @@ -202,8 +487,13 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ - dbp->b_ops = &xfs_dir2_data_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + dbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + else + hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); /* @@ -212,21 +502,22 @@ xfs_dir2_block_to_leaf( ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); - bestsp[0] = hdr->bestfree[0].length; + bestsp[0] = bf[0].length; /* * Log the data header and leaf bests table. */ if (needlog) xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_data_check(dp, dbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, 0); return 0; } STATIC void -xfs_dir2_leaf_find_stale( - struct xfs_dir2_leaf *leaf, +xfs_dir3_leaf_find_stale( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int *lowstale, int *highstale) @@ -235,7 +526,7 @@ xfs_dir2_leaf_find_stale( * Find the first stale entry before our index, if any. */ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { - if (leaf->ents[*lowstale].address == + if (ents[*lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; } @@ -245,10 +536,8 @@ xfs_dir2_leaf_find_stale( * Stop if the result would require moving more entries than using * lowstale. */ - for (*highstale = index; - *highstale < be16_to_cpu(leaf->hdr.count); - ++*highstale) { - if (leaf->ents[*highstale].address == + for (*highstale = index; *highstale < leafhdr->count; ++*highstale) { + if (ents[*highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; if (*lowstale >= 0 && index - *lowstale <= *highstale - index) @@ -257,8 +546,9 @@ xfs_dir2_leaf_find_stale( } struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry( - xfs_dir2_leaf_t *leaf, /* leaf structure */ +xfs_dir3_leaf_find_entry( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, /* leaf table position */ int compact, /* need to compact leaves */ int lowstale, /* index of prev stale leaf */ @@ -266,7 +556,7 @@ xfs_dir2_leaf_find_entry( int *lfloglow, /* low leaf logging index */ int *lfloghigh) /* high leaf logging index */ { - if (!leaf->hdr.stale) { + if (!leafhdr->stale) { xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ /* @@ -274,18 +564,16 @@ xfs_dir2_leaf_find_entry( * * If there are no stale entries, just insert a hole at index. */ - lep = &leaf->ents[index]; - if (index < be16_to_cpu(leaf->hdr.count)) + lep = &ents[index]; + if (index < leafhdr->count) memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * - sizeof(*lep)); + (leafhdr->count - index) * sizeof(*lep)); /* * Record low and high logging indices for the leaf. */ *lfloglow = index; - *lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add_cpu(&leaf->hdr.count, 1); + *lfloghigh = leafhdr->count++; return lep; } @@ -299,16 +587,17 @@ xfs_dir2_leaf_find_entry( * entries before and after our insertion point. */ if (compact == 0) - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, + &lowstale, &highstale); /* * If the low one is better, use it. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale - 1 < highstale - index)) { ASSERT(index - lowstale - 1 >= 0); - ASSERT(leaf->ents[lowstale].address == + ASSERT(ents[lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* @@ -316,37 +605,34 @@ xfs_dir2_leaf_find_entry( * for the new entry. */ if (index - lowstale - 1 > 0) { - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], + memmove(&ents[lowstale], &ents[lowstale + 1], (index - lowstale - 1) * - sizeof(xfs_dir2_leaf_entry_t)); + sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(lowstale, *lfloglow); *lfloghigh = MAX(index - 1, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index - 1]; + leafhdr->stale--; + return &ents[index - 1]; } /* * The high one is better, so use that one. */ ASSERT(highstale - index >= 0); - ASSERT(leaf->ents[highstale].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* * Copy entries down to cover the stale entry and make room for the * new entry. */ if (highstale - index > 0) { - memmove(&leaf->ents[index + 1], - &leaf->ents[index], + memmove(&ents[index + 1], &ents[index], (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(index, *lfloglow); *lfloghigh = MAX(highstale, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index]; + leafhdr->stale--; + return &ents[index]; } /* @@ -383,6 +669,9 @@ xfs_dir2_leaf_addname( __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); @@ -390,7 +679,7 @@ xfs_dir2_leaf_addname( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; @@ -403,16 +692,19 @@ xfs_dir2_leaf_addname( index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); length = xfs_dir2_data_entsize(args->namelen); + /* * See if there are any entries with the same hash value * and space in their block for the new entry. * This is good because it puts multiple same-hash value entries * in a data block, improving the lookup of those entries. */ - for (use_block = -1, lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; @@ -445,7 +737,7 @@ xfs_dir2_leaf_addname( * How many bytes do we need in the leaf block? */ needbytes = 0; - if (!leaf->hdr.stale) + if (!leafhdr.stale) needbytes += sizeof(xfs_dir2_leaf_entry_t); if (use_block == -1) needbytes += sizeof(xfs_dir2_data_off_t); @@ -460,16 +752,15 @@ xfs_dir2_leaf_addname( * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) compact = 1; - } + /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( - leaf->hdr.count)] < needbytes) { + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { /* * Just checking or no space reservation, give up. */ @@ -517,15 +808,15 @@ xfs_dir2_leaf_addname( * point later. */ if (compact) { - xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); } /* * There are stale entries, so we'll need log-low and log-high * impossibly bad values later. */ - else if (be16_to_cpu(leaf->hdr.stale)) { - lfloglow = be16_to_cpu(leaf->hdr.count); + else if (leafhdr.stale) { + lfloglow = leafhdr.count; lfloghigh = -1; } /* @@ -544,7 +835,7 @@ xfs_dir2_leaf_addname( /* * Initialize the block. */ - if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { xfs_trans_brelse(tp, lbp); return error; } @@ -557,23 +848,24 @@ xfs_dir2_leaf_addname( memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); be32_add_cpu(<p->bestcount, 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); hdr = dbp->b_addr; - bestsp[use_block] = hdr->bestfree[0].length; + bf = xfs_dir3_data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; grown = 1; } else { /* * Already had space in some data block. * Just read that one in. */ - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp); if (error) { @@ -581,13 +873,14 @@ xfs_dir2_leaf_addname( return error; } hdr = dbp->b_addr; + bf = xfs_dir3_data_bestfree_p(hdr); grown = 0; } /* * Point to the biggest freespace in our data block. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* @@ -620,13 +913,13 @@ xfs_dir2_leaf_addname( * If the bests table needs to be changed, do it. * Log the change unless we've already done that. */ - if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) { - bestsp[use_block] = hdr->bestfree[0].length; + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); } - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); /* @@ -638,82 +931,40 @@ xfs_dir2_leaf_addname( /* * Log the leaf fields and give up the buffers. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_data_check(dp, dbp); return 0; } -#ifdef DEBUG -/* - * Check the internal consistency of a leaf1 block. - * Pop an assert if something is wrong. - */ -STATIC void -xfs_dir2_leaf_check( - struct xfs_inode *dp, /* incore directory inode */ - struct xfs_buf *bp) /* leaf's buffer */ -{ - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ - - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - /* - * This value is not restrictive enough. - * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. - */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Leaves and bests don't overlap. - */ - ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)xfs_dir2_leaf_bests_p(ltp)); - /* - * Check hash value order, count stale entries. - */ - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); -} -#endif /* DEBUG */ - /* * Compact out any stale entries in the leaf. * Log the header and changed leaf entries, if any. */ void -xfs_dir2_leaf_compact( +xfs_dir3_leaf_compact( xfs_da_args_t *args, /* operation arguments */ + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; leaf = bp->b_addr; - if (!leaf->hdr.stale) { + if (!leafhdr->stale) return; - } + /* * Compress out the stale entries in place. */ - for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + ents = xfs_dir3_leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -721,19 +972,21 @@ xfs_dir2_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; } to++; } /* * Update and log the header, log the leaf entries. */ - ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); - be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(args->trans, bp); + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args->trans, bp); if (loglow != -1) - xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1); } /* @@ -745,8 +998,9 @@ xfs_dir2_leaf_compact( * and leaf logging indices. */ void -xfs_dir2_leaf_compact_x1( - struct xfs_buf *bp, /* leaf buffer */ +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -757,22 +1011,20 @@ xfs_dir2_leaf_compact_x1( int highstale; /* stale entry at/after index */ int index; /* insertion index */ int keepstale; /* source index of kept stale */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int lowstale; /* stale entry before index */ int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->b_addr; - ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + ASSERT(leafhdr->stale > 1); index = *indexp; - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); /* * Pick the better of lowstale and highstale. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale <= highstale - index)) keepstale = lowstale; else @@ -781,15 +1033,14 @@ xfs_dir2_leaf_compact_x1( * Copy the entries in place, removing all the stale entries * except keepstale. */ - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + for (from = to = 0; from < leafhdr->count; from++) { /* * Notice the new value of index. */ if (index == from) newindex = to; if (from != keepstale && - leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (from == to) *lowlogp = to; continue; @@ -803,7 +1054,7 @@ xfs_dir2_leaf_compact_x1( * Copy only the entries that have moved. */ if (from > to) - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; to++; } ASSERT(from > to); @@ -817,8 +1068,8 @@ xfs_dir2_leaf_compact_x1( /* * Adjust the leaf header values. */ - be16_add_cpu(&leaf->hdr.count, -(from - to)); - leaf->hdr.stale = cpu_to_be16(1); + leafhdr->count -= from - to; + leafhdr->stale = 1; /* * Remember the low/high stale value only in the "right" * direction. @@ -826,8 +1077,8 @@ xfs_dir2_leaf_compact_x1( if (lowstale >= newindex) lowstale = -1; else - highstale = be16_to_cpu(leaf->hdr.count); - *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; *lowstalep = lowstale; *highstalep = highstale; } @@ -857,6 +1108,7 @@ xfs_dir2_leaf_readbuf( struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp = *bpp; struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; int error = 0; int length; int i; @@ -965,7 +1217,7 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_dir2_data_read(NULL, dp, map->br_startoff, + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); @@ -985,6 +1237,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? */ + blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; mip->ra_want > mip->ra_current && i < mip->map_blocks; i += mp->m_dirblkfsbs) { @@ -994,7 +1247,7 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir2_data_readahead(NULL, dp, + xfs_dir3_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + @@ -1007,7 +1260,7 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. */ else if (i > mip->ra_current) { - xfs_dir2_data_readahead(NULL, dp, + xfs_dir3_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, -1); mip->ra_current = i; @@ -1036,6 +1289,7 @@ xfs_dir2_leaf_readbuf( } } } + blk_finish_plug(&plug); out: *bpp = bp; @@ -1049,10 +1303,8 @@ out: int /* error */ xfs_dir2_leaf_getdents( xfs_inode_t *dp, /* incore directory inode */ - void *dirent, - size_t bufsize, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx, + size_t bufsize) { struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ @@ -1071,7 +1323,7 @@ xfs_dir2_leaf_getdents( * If the offset is at or past the largest allowed value, * give up right away. */ - if (*offset >= XFS_DIR2_MAX_DATAPTR) + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) return 0; mp = dp->i_mount; @@ -1085,14 +1337,14 @@ xfs_dir2_leaf_getdents( mp->m_sb.sb_blocksize); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP); + KM_SLEEP | KM_NOFS); map_info->map_size = length; /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ - curoff = xfs_dir2_dataptr_to_byte(mp, *offset); + curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); /* * Force this conversion through db so we truncate the offset @@ -1133,17 +1385,17 @@ xfs_dir2_leaf_getdents( ASSERT(xfs_dir2_byte_to_db(mp, curoff) == map_info->curdb); hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); byteoff = xfs_dir2_byte_to_off(mp, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += (uint)sizeof(*hdr); + curoff += xfs_dir3_data_entry_offset(hdr); /* * Skip past entries until we reach our offset. */ @@ -1193,8 +1445,8 @@ xfs_dir2_leaf_getdents( dep = (xfs_dir2_data_entry_t *)ptr; length = xfs_dir2_data_entsize(dep->namelen); - if (filldir(dirent, (char *)dep->name, dep->namelen, - xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, + ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), DT_UNKNOWN)) break; @@ -1211,78 +1463,21 @@ xfs_dir2_leaf_getdents( * All done. Set output offset value to current offset. */ if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) - *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else - *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; kmem_free(map_info); if (bp) xfs_trans_brelse(NULL, bp); return error; } -/* - * Initialize a new leaf block, leaf1 or leafn magic accepted. - */ -int -xfs_dir2_leaf_init( - xfs_da_args_t *args, /* operation arguments */ - xfs_dir2_db_t bno, /* directory block number */ - struct xfs_buf **bpp, /* out: leaf buffer */ - int magic) /* magic number for block */ -{ - struct xfs_buf *bp; /* leaf buffer */ - xfs_inode_t *dp; /* incore directory inode */ - int error; /* error return code */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - - dp = args->dp; - ASSERT(dp != NULL); - tp = args->trans; - mp = dp->i_mount; - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); - /* - * Get the buffer for the block. - */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) - return error; - - /* - * Initialize the header. - */ - leaf = bp->b_addr; - leaf->hdr.info.magic = cpu_to_be16(magic); - leaf->hdr.info.forw = 0; - leaf->hdr.info.back = 0; - leaf->hdr.count = 0; - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(tp, bp); - /* - * If it's a leaf-format directory initialize the tail. - * In this case our caller has the real bests table to copy into - * the block. - */ - if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_ops = &xfs_dir2_leaf1_buf_ops; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = 0; - xfs_dir2_leaf_log_tail(tp, bp); - } else - bp->b_ops = &xfs_dir2_leafn_buf_ops; - *bpp = bp; - return 0; -} /* * Log the bests entries indicated from a leaf1 block. */ static void -xfs_dir2_leaf_log_bests( +xfs_dir3_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ @@ -1290,11 +1485,12 @@ xfs_dir2_leaf_log_bests( { __be16 *firstb; /* pointer to first entry */ __be16 *lastb; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; @@ -1306,7 +1502,7 @@ xfs_dir2_leaf_log_bests( * Log the leaf entries indicated from a leaf1 or leafn block. */ void -xfs_dir2_leaf_log_ents( +xfs_dir3_leaf_log_ents( xfs_trans_t *tp, /* transaction pointer */ struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ @@ -1314,13 +1510,17 @@ xfs_dir2_leaf_log_ents( { xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - firstlep = &leaf->ents[first]; - lastlep = &leaf->ents[last]; + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ents = xfs_dir3_leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1329,34 +1529,38 @@ xfs_dir2_leaf_log_ents( * Log the header of the leaf1 or leafn block. */ void -xfs_dir2_leaf_log_header( +xfs_dir3_leaf_log_header( struct xfs_trans *tp, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - (uint)(sizeof(leaf->hdr) - 1)); + xfs_dir3_leaf_hdr_size(leaf) - 1); } /* * Log the tail of the leaf1 block. */ STATIC void -xfs_dir2_leaf_log_tail( +xfs_dir3_leaf_log_tail( struct xfs_trans *tp, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_mount *mp = tp->t_mountp; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - mp = tp->t_mountp; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); @@ -1380,6 +1584,7 @@ xfs_dir2_leaf_lookup( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_lookup(args); @@ -1391,12 +1596,14 @@ xfs_dir2_leaf_lookup( } tp = args->trans; dp = args->dp; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp->i_mount, lbp); leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); /* * Get to the leaf entry and contained data entry address. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Point to the data entry. */ @@ -1440,18 +1647,23 @@ xfs_dir2_leaf_lookup_int( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t cidb = -1; /* case match data block no. */ enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + /* * Look for the first leaf entry with our hash value. */ @@ -1460,9 +1672,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip over stale leaf entries. */ @@ -1479,7 +1691,7 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &dbp); if (error) { @@ -1520,7 +1732,7 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, cidb), -1, &dbp); if (error) { @@ -1566,6 +1778,9 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_removename(args); @@ -1580,16 +1795,19 @@ xfs_dir2_leaf_removename( mp = dp->i_mount; leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); + bf = xfs_dir3_data_bestfree_p(hdr); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); /* * Point to the leaf entry, use that to point to the data entry. */ - lep = &leaf->ents[index]; + lep = &ents[index]; db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); needscan = needlog = 0; - oldbest = be16_to_cpu(hdr->bestfree[0].length); + oldbest = be16_to_cpu(bf[0].length); ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); @@ -1602,10 +1820,13 @@ xfs_dir2_leaf_removename( /* * We just mark the leaf entry stale by putting a null in it. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, lbp); + leafhdr.stale++; + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, lbp, index, index); + xfs_dir3_leaf_log_ents(tp, lbp, index, index); + /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. @@ -1618,16 +1839,16 @@ xfs_dir2_leaf_removename( * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ - if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) { - bestsp[db] = hdr->bestfree[0].length; - xfs_dir2_leaf_log_bests(tp, lbp, db, db); + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(tp, lbp, db, db); } - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ - if (be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (be16_to_cpu(bf[0].length) == + mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) { ASSERT(db != mp->m_dirdatablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* @@ -1638,7 +1859,7 @@ xfs_dir2_leaf_removename( */ if (error == ENOSPC && args->total == 0) error = 0; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); return error; } dbp = NULL; @@ -1661,8 +1882,8 @@ xfs_dir2_leaf_removename( memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); be32_add_cpu(<p->bestcount, -(db - i)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } @@ -1672,7 +1893,7 @@ xfs_dir2_leaf_removename( else if (db != mp->m_dirdatablk) dbp = NULL; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); /* * See if we can convert to block form. */ @@ -1695,6 +1916,7 @@ xfs_dir2_leaf_replace( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_replace(args); @@ -1706,10 +1928,11 @@ xfs_dir2_leaf_replace( } dp = args->dp; leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ - lep = &leaf->ents[index]; + lep = &ents[index]; /* * Point to the data entry. */ @@ -1723,7 +1946,7 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp->i_mount, lbp); xfs_trans_brelse(tp, lbp); return 0; } @@ -1745,17 +1968,22 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + #ifndef __KERNEL__ - if (!leaf->hdr.count) + if (!leafhdr.count) return 0; #endif /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + for (lep = ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1807,7 +2035,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); if (error) return error; @@ -1817,10 +2045,12 @@ xfs_dir2_leaf_trim_data( #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr); - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); - ASSERT(be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1839,23 +2069,29 @@ xfs_dir2_leaf_trim_data( bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } static inline size_t -xfs_dir2_leaf_size( - struct xfs_dir2_leaf_hdr *hdr, +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, int counts) { - int entries; + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); - entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale); - return sizeof(xfs_dir2_leaf_hdr_t) + - entries * sizeof(xfs_dir2_leaf_entry_t) + - counts * sizeof(xfs_dir2_data_off_t) + - sizeof(xfs_dir2_leaf_tail_t); + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); } /* @@ -1879,6 +2115,8 @@ xfs_dir2_node_to_leaf( xfs_mount_t *mp; /* filesystem mount point */ int rval; /* successful free trim? */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; /* * There's more than a leaf level in the btree, so there must @@ -1928,7 +2166,11 @@ xfs_dir2_node_to_leaf( return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + /* * Read the freespace block. */ @@ -1936,44 +2178,49 @@ xfs_dir2_node_to_leaf( if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(!free->hdr.firstdb); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); /* * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > - mp->m_dirblksize) { + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) { xfs_trans_brelse(tp, fbp); return 0; } /* * If the leaf has any stale entries in it, compress them out. - * The compact routine will log the header. */ - if (be16_to_cpu(leaf->hdr.stale)) - xfs_dir2_leaf_compact(args, lbp); - else - xfs_dir2_leaf_log_header(tp, lbp); + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); - lbp->b_ops = &xfs_dir2_leaf1_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; /* * Set up the leaf tail from the freespace block. */ ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = free->hdr.nvalid; + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + /* * Set up the leaf bests table. */ - memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, - be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t)); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_check(dp, lbp); + memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_check(mp, lbp); + /* * Get rid of the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5980f9b7fa9b..2226a00acd15 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -32,20 +33,14 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Function declarations. */ static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, int index); -#ifdef DEBUG -static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leafn_check(dp, bp) -#endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, - int start_s, struct xfs_buf *bp_d, - int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -55,52 +50,126 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); -static void -xfs_dir2_free_verify( +/* + * Check internal consistency of a leafn block. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(mp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((mp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(mp, bp) +#endif + +static bool +xfs_dir3_free_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; - int block_ok = 0; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; } static void -xfs_dir2_free_read_verify( +xfs_dir3_free_read_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_FREE_CRC_OFF)) || + !xfs_dir3_free_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_free_write_verify( +xfs_dir3_free_write_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); } -static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { - .verify_read = xfs_dir2_free_read_verify, - .verify_write = xfs_dir2_free_write_verify, +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, }; static int -__xfs_dir2_free_read( +__xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_free_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; } int @@ -110,7 +179,7 @@ xfs_dir2_free_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); } static int @@ -120,7 +189,96 @@ xfs_dir2_free_try_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + + +void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) { + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + } else { + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + } + + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC || + to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC || + from->magic == XFS_DIR3_FREE_MAGIC); + + if (from->magic == XFS_DIR2_FREE_MAGIC) { + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); + } else { + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); + } +} + +static int +xfs_dir3_free_get_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; + xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; } /* @@ -134,13 +292,16 @@ xfs_dir2_free_log_bests( int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); xfs_trans_log_buf(tp, bp, - (uint)((char *)&free->bests[first] - (char *)free), - (uint)((char *)&free->bests[last] - (char *)free + - sizeof(free->bests[0]) - 1)); + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); } /* @@ -154,9 +315,9 @@ xfs_dir2_free_log_header( xfs_dir2_free_t *free; /* freespace structure */ free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), - (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1); } /* @@ -183,6 +344,7 @@ xfs_dir2_leaf_to_node( xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; trace_xfs_dir2_leaf_to_node(args); @@ -199,44 +361,53 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; + xfs_dir3_free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Initialize the freespace block header. - */ - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = 0; - ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); - free->hdr.nvalid = ltp->bestcount; + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / mp->m_dirblksize); + /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; - i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + from = xfs_dir2_leaf_bests_p(ltp); + to = xfs_dir3_free_bests_p(mp, free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; *to = cpu_to_be16(off); } - free->hdr.nused = cpu_to_be32(n); - - lbp->b_ops = &xfs_dir2_leafn_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); /* - * Log everything. + * Now initialize the freespace block header. */ - xfs_dir2_leaf_log_header(tp, lbp); + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1); xfs_dir2_free_log_header(tp, fbp); - xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_dir2_leafn_check(dp, lbp); + + /* + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. + */ + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_check(mp, lbp); return 0; } @@ -260,6 +431,8 @@ xfs_dir2_leafn_add( int lowstale; /* previous stale entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_add(args, index); @@ -267,6 +440,8 @@ xfs_dir2_leafn_add( mp = dp->i_mount; tp = args->trans; leaf = bp->b_addr; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); /* * Quick check just to make sure we are not going to index @@ -282,15 +457,15 @@ xfs_dir2_leafn_add( * a compact. */ - if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { - if (!leaf->hdr.stale) + if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) { + if (!leafhdr.stale) return XFS_ERROR(ENOSPC); - compact = be16_to_cpu(leaf->hdr.stale) > 1; + compact = leafhdr.stale > 1; } else compact = 0; - ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; @@ -299,61 +474,51 @@ xfs_dir2_leafn_add( * Compact out all but one stale leaf entry. Leaves behind * the entry closest to index. */ - if (compact) { - xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); - } - /* - * Set impossible logging indices for this case. - */ - else if (leaf->hdr.stale) { - lfloglow = be16_to_cpu(leaf->hdr.count); + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { + /* + * Set impossible logging indices for this case. + */ + lfloglow = leafhdr.count; lfloghigh = -1; } /* * Insert the new entry, log everything. */ - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); lep->hashval = cpu_to_be32(args->hashval); lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, args->blkno, args->index)); - xfs_dir2_leaf_log_header(tp, bp); - xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); - xfs_dir2_leafn_check(dp, bp); + + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, bp); + xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(mp, bp); return 0; } #ifdef DEBUG -/* - * Check internal consistency of a leafn block. - */ -void -xfs_dir2_leafn_check( - struct xfs_inode *dp, - struct xfs_buf *bp) +static void +xfs_dir2_free_hdr_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_dir2_db_t db) { - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ + struct xfs_dir3_icfree_hdr hdr; - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) { - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - } - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); + xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); } +#else +#define xfs_dir2_free_hdr_check(mp, dp, db) #endif /* DEBUG */ /* @@ -365,15 +530,22 @@ xfs_dir2_leafn_lasthash( struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) + *count = leafhdr.count; + if (!leafhdr.count) return 0; - return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); + + ents = xfs_dir3_leaf_ents_p(leaf); + return be32_to_cpu(ents[leafhdr.count - 1].hashval); } /* @@ -402,16 +574,19 @@ xfs_dir2_leafn_lookup_for_addname( xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + xfs_dir3_leaf_check(mp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -424,15 +599,16 @@ xfs_dir2_leafn_lookup_for_addname( curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; free = curbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -451,6 +627,8 @@ xfs_dir2_leafn_lookup_for_addname( * in hand, take a look at it. */ if (newdb != curdb) { + __be16 *bests; + curdb = newdb; /* * Convert the data block to the free block @@ -473,13 +651,8 @@ xfs_dir2_leafn_lookup_for_addname( if (error) return error; free = curbp->b_addr; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - xfs_dir2_free_max_bests(mp)) == 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); + + xfs_dir2_free_hdr_check(mp, curbp, curdb); } /* * Get the index for our entry. @@ -488,8 +661,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * If it has room, return it. */ - if (unlikely(free->bests[fi] == - cpu_to_be16(NULLDATAOFF))) { + bests = xfs_dir3_free_bests_p(mp, free); + if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) @@ -497,7 +670,7 @@ xfs_dir2_leafn_lookup_for_addname( return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) + if (be16_to_cpu(bests[fi]) >= length) goto out; } } @@ -511,6 +684,12 @@ out: state->extrablk.bp = curbp; state->extrablk.index = fi; state->extrablk.blkno = curfdb; + + /* + * Important: this magic number is not in the buffer - it's for + * buffer type information and therefore only the free/data type + * matters here, not whether CRCs are enabled or not. + */ state->extrablk.magic = XFS_DIR2_FREE_MAGIC; } else { state->extravalid = 0; @@ -545,16 +724,19 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + xfs_dir3_leaf_check(mp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -569,9 +751,9 @@ xfs_dir2_leafn_lookup_for_entry( /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -604,13 +786,13 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &curbp); if (error) return error; } - xfs_dir2_data_check(dp, curbp); + xfs_dir3_data_check(dp, curbp); curdb = newdb; } /* @@ -638,13 +820,13 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } } - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - (args->op_flags & XFS_DA_OP_OKNOENT)); + ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); if (curbp) { if (args->cmpresult == XFS_CMP_DIFFERENT) { /* Giving back last used data block. */ @@ -653,7 +835,8 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -689,52 +872,50 @@ xfs_dir2_leafn_lookup_int( * Log entries and headers. Stale entries are preserved. */ static void -xfs_dir2_leafn_moveents( - xfs_da_args_t *args, /* operation arguments */ - struct xfs_buf *bp_s, /* source leaf buffer */ - int start_s, /* source leaf index */ - struct xfs_buf *bp_d, /* destination leaf buffer */ - int start_d, /* destination leaf index */ - int count) /* count of leaves to copy */ +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ { - xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ - xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ - int stale; /* count stale leaves copied */ - xfs_trans_t *tp; /* transaction pointer */ + struct xfs_trans *tp = args->trans; + int stale; /* count stale leaves copied */ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); /* * Silently return if nothing to do. */ - if (count == 0) { + if (count == 0) return; - } - tp = args->trans; - leaf_s = bp_s->b_addr; - leaf_d = bp_d->b_addr; + /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination * to hold the new entries. */ - if (start_d < be16_to_cpu(leaf_d->hdr.count)) { - memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], - (be16_to_cpu(leaf_d->hdr.count) - start_d) * - sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, - count + be16_to_cpu(leaf_d->hdr.count) - 1); + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count, + count + dhdr->count - 1); } /* * If the source has stale leaves, count the ones in the copy range * so we can update the header correctly. */ - if (leaf_s->hdr.stale) { + if (shdr->stale) { int i; /* temp leaf index */ for (i = start_s, stale = 0; i < start_s + count; i++) { - if (leaf_s->ents[i].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } } else @@ -742,29 +923,27 @@ xfs_dir2_leafn_moveents( /* * Copy the leaf entries from source to destination. */ - memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + /* * If there are source entries after the ones we copied, * delete the ones we copied by sliding the next ones down. */ - if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { - memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); } + /* * Update the headers and log them. */ - be16_add_cpu(&leaf_s->hdr.count, -(count)); - be16_add_cpu(&leaf_s->hdr.stale, -(stale)); - be16_add_cpu(&leaf_d->hdr.count, count); - be16_add_cpu(&leaf_d->hdr.stale, stale); - xfs_dir2_leaf_log_header(tp, bp_s); - xfs_dir2_leaf_log_header(tp, bp_d); - xfs_dir2_leafn_check(args->dp, bp_s); - xfs_dir2_leafn_check(args->dp, bp_d); + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; } /* @@ -773,21 +952,25 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - struct xfs_buf *leaf1_bp, /* leaf1 buffer */ - struct xfs_buf *leaf2_bp) /* leaf2 buffer */ + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { - xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ - xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - if (be16_to_cpu(leaf1->hdr.count) > 0 && - be16_to_cpu(leaf2->hdr.count) > 0 && - (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || - be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < - be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = xfs_dir3_leaf_ents_p(leaf1); + ents2 = xfs_dir3_leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) return 1; return 0; } @@ -811,11 +994,15 @@ xfs_dir2_leafn_rebalance( xfs_dir2_leaf_t *leaf1; /* first leaf structure */ xfs_dir2_leaf_t *leaf2; /* second leaf structure */ int mid; /* midpoint leaf index */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int oldstale; /* old count of stale leaves */ #endif int oldsum; /* old total leaf count */ int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; args = state->args; /* @@ -830,11 +1017,17 @@ xfs_dir2_leafn_rebalance( } leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); -#ifdef DEBUG - oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); + xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = xfs_dir3_leaf_ents_p(leaf1); + ents2 = xfs_dir3_leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; +#if defined(DEBUG) || defined(XFS_WARN) + oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; + /* * If the old leaf count was odd then the new one will be even, * so we need to divide the new count evenly. @@ -842,10 +1035,10 @@ xfs_dir2_leafn_rebalance( if (oldsum & 1) { xfs_dahash_t midhash; /* middle entry hash value */ - if (mid >= be16_to_cpu(leaf1->hdr.count)) - midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); else - midhash = be32_to_cpu(leaf1->ents[mid].hashval); + midhash = be32_to_cpu(ents1[mid].hashval); isleft = args->hashval <= midhash; } /* @@ -859,30 +1052,42 @@ xfs_dir2_leafn_rebalance( * Calculate moved entry count. Positive means left-to-right, * negative means right-to-left. Then move the entries. */ - count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + count = hdr1.count - mid + (isleft == 0); if (count > 0) - xfs_dir2_leafn_moveents(args, blk1->bp, - be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); else if (count < 0) - xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, - be16_to_cpu(leaf1->hdr.count), count); - ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); - ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1); + xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args->trans, blk1->bp); + xfs_dir3_leaf_log_header(args->trans, blk2->bp); + + xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp); + xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp); + /* * Mark whether we're inserting into the old or new leaf. */ - if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + if (hdr1.count < hdr2.count) state->inleaf = swap; - else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + else if (hdr1.count > hdr2.count) state->inleaf = !swap; else - state->inleaf = - swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + state->inleaf = swap ^ (blk1->index <= hdr1.count); /* * Adjust the expected index for insertion. */ if (!state->inleaf) - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - hdr1.count; /* * Finally sanity check just to make sure we are not returning a @@ -898,7 +1103,7 @@ xfs_dir2_leafn_rebalance( } static int -xfs_dir2_data_block_free( +xfs_dir3_data_block_free( xfs_da_args_t *args, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_free *free, @@ -909,57 +1114,66 @@ xfs_dir2_data_block_free( { struct xfs_trans *tp = args->trans; int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; - if (!hdr) { - /* One less used entry in the free table. */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + if (hdr) { /* - * If this was the last entry in the table, we can trim the - * table size back. There might be other entries at the end - * referring to non-existent data blocks, get those too. + * Data block is not empty, just set the free entry to the new + * value. */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; + } - for (i = findex - 1; i >= 0; i--) { - if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) - break; - } - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } else { - /* Not the last entry, just punch it out. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - int error; + /* One less used entry in the free table. */ + freehdr.nused--; - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; } + freehdr.nvalid = i + 1; + logfree = 0; } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + xfs_dir3_free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; /* - * Data block is not empty, just set the free entry to the new - * value. + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. */ - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; } /* Log the free entry that changed, unless we got rid of it. */ @@ -994,6 +1208,9 @@ xfs_dir2_leafn_remove( int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_remove(args, index); @@ -1001,11 +1218,14 @@ xfs_dir2_leafn_remove( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + /* * Point to the entry we're removing. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Extract the data block and offset from the entry. */ @@ -1013,14 +1233,18 @@ xfs_dir2_leafn_remove( ASSERT(dblk->blkno == db); off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); + /* * Kill the leaf entry by marking it stale. * Log the leaf block changes. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, bp); + leafhdr.stale++; + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, bp, index, index); + xfs_dir3_leaf_log_ents(tp, bp, index, index); + /* * Make the data entry free. Keep track of the longest freespace * in the data block in case it changes. @@ -1028,7 +1252,8 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - longest = be16_to_cpu(hdr->bestfree[0].length); + bf = xfs_dir3_data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); @@ -1040,12 +1265,12 @@ xfs_dir2_leafn_remove( xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update * the corresponding freeblock entry. */ - if (longest < be16_to_cpu(hdr->bestfree[0].length)) { + if (longest < be16_to_cpu(bf[0].length)) { int error; /* error return value */ struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ @@ -1062,20 +1287,25 @@ xfs_dir2_leafn_remove( if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(be32_to_cpu(free->hdr.firstdb) == - xfs_dir2_free_max_bests(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + xfs_dir3_free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) * + (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + } +#endif /* * Calculate which entry we need to fix. */ findex = xfs_dir2_db_to_fdindex(mp, db); - longest = be16_to_cpu(hdr->bestfree[0].length); + longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (longest == mp->m_dirblksize - + xfs_dir3_data_entry_offset(hdr)) { /* * Try to punch out the data block. */ @@ -1096,21 +1326,19 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - error = xfs_dir2_data_block_free(args, hdr, free, + error = xfs_dir3_data_block_free(args, hdr, free, fdb, findex, fbp, longest); if (error) return error; } - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_check(mp, bp); /* * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = - ((uint)sizeof(leaf->hdr) + - (uint)sizeof(leaf->ents[0]) * - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < + *rval = (xfs_dir3_leaf_hdr_size(leaf) + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < mp->m_dir_magicpct; return 0; } @@ -1143,11 +1371,11 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), - &newblk->bp, XFS_DIR2_LEAFN_MAGIC); - if (error) { + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) return error; - } + newblk->blkno = blkno; newblk->magic = XFS_DIR2_LEAFN_MAGIC; /* @@ -1155,7 +1383,7 @@ xfs_dir2_leafn_split( * block into the leaves. */ xfs_dir2_leafn_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) { return error; } @@ -1171,8 +1399,8 @@ xfs_dir2_leafn_split( */ oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); - xfs_dir2_leafn_check(args->dp, oldblk->bp); - xfs_dir2_leafn_check(args->dp, newblk->bp); + xfs_dir3_leaf_check(mp, oldblk->bp); + xfs_dir3_leaf_check(mp, newblk->bp); return error; } @@ -1198,9 +1426,10 @@ xfs_dir2_leafn_toosmall( int error; /* error return value */ int forward; /* sibling block direction */ int i; /* sibling counter */ - xfs_da_blkinfo_t *info; /* leaf block header */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; /* * Check for the degenerate case of the block being over 50% full. @@ -1208,11 +1437,13 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); + leaf = blk->bp->b_addr; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]); if (bytes > (state->blocksize >> 1)) { /* * Blk over 50%, don't try to join. @@ -1231,9 +1462,9 @@ xfs_dir2_leafn_toosmall( * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (leafhdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); if (error) return error; @@ -1247,15 +1478,17 @@ xfs_dir2_leafn_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + forward = leafhdr.forw < leafhdr.back; for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { - blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; if (blkno == 0) continue; /* * Read the sibling leaf block. */ - error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + error = xfs_dir3_leafn_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return error; @@ -1263,13 +1496,15 @@ xfs_dir2_leafn_toosmall( /* * Count bytes in the two blocks combined. */ - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + count = leafhdr.count - leafhdr.stale; bytes = state->blocksize - (state->blocksize >> 2); + leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes -= count * (uint)sizeof(leaf->ents[0]); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + /* * Fits with at least 25% to spare. */ @@ -1291,10 +1526,10 @@ xfs_dir2_leafn_toosmall( */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); else - error = xfs_da_path_shift(state, &state->path, forward, 0, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &rval); if (error) { return error; @@ -1316,34 +1551,53 @@ xfs_dir2_leafn_unbalance( xfs_da_args_t *args; /* operation arguments */ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + + xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = xfs_dir3_leaf_ents_p(save_leaf); + dents = xfs_dir3_leaf_ents_p(drop_leaf); + /* * If there are any stale leaf entries, take this opportunity * to purge them. */ - if (drop_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, drop_blk->bp); - if (save_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, save_blk->bp); + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + /* * Move the entries from drop to the appropriate end of save. */ - drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, - be16_to_cpu(drop_leaf->hdr.count)); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); else - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, - be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); - save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); - xfs_dir2_leafn_check(args->dp, save_blk->bp); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args->trans, save_blk->bp); + xfs_dir3_leaf_log_header(args->trans, drop_blk->bp); + + xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp); + xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp); } /* @@ -1372,7 +1626,7 @@ xfs_dir2_node_addname( * Look up the name. We're not supposed to find it, but * this gives us the insertion point. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; if (rval != ENOENT) { @@ -1398,7 +1652,7 @@ xfs_dir2_node_addname( * It worked, fix the hash values up the btree. */ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } else { /* * It didn't work, we need to split the leaf block. @@ -1410,7 +1664,7 @@ xfs_dir2_node_addname( /* * Split the leaf block and insert the new entry. */ - rval = xfs_da_split(state); + rval = xfs_da3_split(state); } done: xfs_da_state_free(state); @@ -1447,6 +1701,9 @@ xfs_dir2_node_addname_int( int needscan; /* need to rescan data frees */ __be16 *tagp; /* data entry tag pointer */ xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; dp = args->dp; mp = dp->i_mount; @@ -1464,36 +1721,37 @@ xfs_dir2_node_addname_int( */ ifbno = fblk->blkno; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + /* * This means the free entry showed that the data block had * space for our entry, so we remembered it. * Use that data block. */ if (findex >= 0) { - ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); - ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(free->bests[findex]) >= length); - dbno = be32_to_cpu(free->hdr.firstdb) + findex; - } - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - else { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. + */ dbno = -1; findex = 0; } - } - /* - * Didn't come in with a freespace block, so don't have a data block. - */ - else { + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ ifbno = dbno = -1; fbp = NULL; findex = 0; } + /* * If we don't have a data block yet, we're going to scan the * freespace blocks looking for one. Figure out what the @@ -1547,20 +1805,26 @@ xfs_dir2_node_addname_int( if (!fbp) continue; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } /* * Look at the current free entry. Is it good enough? + * + * The bests initialisation should be where the bufer is read in + * the above branch. But gcc is too stupid to realise that bests + * and the freehdr are actually initialised if they are placed + * there, so we have to do it here to avoid warnings. Blech. */ - if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && - be16_to_cpu(free->bests[findex]) >= length) - dbno = be32_to_cpu(free->hdr.firstdb) + findex; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; else { /* * Are we done with the freeblock? */ - if (++findex == be32_to_cpu(free->hdr.nvalid)) { + if (++findex == freehdr.nvalid) { /* * Drop the block. */ @@ -1588,7 +1852,7 @@ xfs_dir2_node_addname_int( if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) + (error = xfs_dir3_data_init(args, dbno, &dbp)))) return error; /* @@ -1614,11 +1878,11 @@ xfs_dir2_node_addname_int( * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. */ - if( fbp == NULL ) { - if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno))) { + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) return error; - } if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { xfs_alert(mp, @@ -1646,27 +1910,22 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; + free = fbp->b_addr; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); /* - * Initialize the new block to be empty, and remember - * its first slot as our empty slot. + * Remember the first slot as our empty slot. */ - free = fbp->b_addr; - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = cpu_to_be32( - (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - xfs_dir2_free_max_bests(mp)); - free->hdr.nvalid = 0; - free->hdr.nused = 0; + freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * + xfs_dir3_free_max_bests(mp); } else { free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); } /* @@ -1677,20 +1936,21 @@ xfs_dir2_node_addname_int( * If it's after the end of the current entries in the * freespace block, extend that table. */ - if (findex >= be32_to_cpu(free->hdr.nvalid)) { - ASSERT(findex < xfs_dir2_free_max_bests(mp)); - free->hdr.nvalid = cpu_to_be32(findex + 1); + if (findex >= freehdr.nvalid) { + ASSERT(findex < xfs_dir3_free_max_bests(mp)); + freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); + bests[findex] = cpu_to_be16(NULLDATAOFF); } /* * If this entry was for an empty data block * (this should always be true) then update the header. */ - if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) { - be32_add_cpu(&free->hdr.nused, 1); + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); xfs_dir2_free_log_header(tp, fbp); } /* @@ -1699,7 +1959,8 @@ xfs_dir2_node_addname_int( * change again. */ hdr = dbp->b_addr; - free->bests[findex] = hdr->bestfree[0].length; + bf = xfs_dir3_data_bestfree_p(hdr); + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1715,19 +1976,20 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), -1, &dbp); if (error) return error; hdr = dbp->b_addr; + bf = xfs_dir3_data_bestfree_p(hdr); logfree = 0; } - ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); + ASSERT(be16_to_cpu(bf[0].length) >= length); /* * Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); needscan = needlog = 0; /* * Mark the first part of the unused space, inuse for us. @@ -1758,8 +2020,9 @@ xfs_dir2_node_addname_int( /* * If the freespace entry is now wrong, update it. */ - if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) { - free->bests[findex] = hdr->bestfree[0].length; + bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1777,7 +2040,7 @@ xfs_dir2_node_addname_int( /* * Lookup an entry in a node-format directory. - * All the real work happens in xfs_da_node_lookup_int. + * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. */ int /* error */ @@ -1802,7 +2065,7 @@ xfs_dir2_node_lookup( /* * Fill in the path to the entry in the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { @@ -1857,7 +2120,7 @@ xfs_dir2_node_removename( /* * Look up the entry we're deleting, set up the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; /* @@ -1881,12 +2144,12 @@ xfs_dir2_node_removename( /* * Fix the hash values up the btree. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); /* * If we need to join leaf blocks, do it. */ if (rval && state->path.active > 1) - error = xfs_da_join(state); + error = xfs_da3_join(state); /* * If no errors so far, try conversion to leaf format. */ @@ -1928,7 +2191,7 @@ xfs_dir2_node_replace( /* * Lookup the entry to change in the btree. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) { rval = error; } @@ -1937,19 +2200,22 @@ xfs_dir2_node_replace( * and locked it. But paranoia is good. */ if (rval == EEXIST) { + struct xfs_dir2_leaf_entry *ents; /* * Find the leaf entry. */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); leaf = blk->bp->b_addr; - lep = &leaf->ents[blk->index]; + ents = xfs_dir3_leaf_ents_p(leaf); + lep = &ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ hdr = state->extrablk.bp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); @@ -1995,6 +2261,7 @@ xfs_dir2_node_trim_free( xfs_dir2_free_t *free; /* freespace structure */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; mp = dp->i_mount; @@ -2012,11 +2279,12 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + /* * If there are used entries, there's nothing to do. */ - if (be32_to_cpu(free->hdr.nused) > 0) { + if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7da79f6515fd..0511cda4a712 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,11 +30,11 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ -extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_block_getdents(struct xfs_inode *dp, + struct dir_context *ctx); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); @@ -43,17 +43,18 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); #else -#define xfs_dir2_data_check(dp,bp) +#define xfs_dir3_data_check(dp,bp) #endif -extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; -extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); -extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * @@ -61,7 +62,7 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); -extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); @@ -77,24 +78,26 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; -extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, +extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, - size_t bufsize, xfs_off_t *offset, filldir_t filldir); -extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_buf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, +extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, +extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); @@ -104,11 +107,18 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, - int lowstale, int highstale, - int *lfloglow, int *lfloghigh); +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); +extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); +extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); @@ -143,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 1b9fc3ec7e4b..97676a347da1 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -278,7 +278,7 @@ xfs_dir2_block_to_sf( * Set up to loop over the block's entries. */ btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); /* @@ -535,7 +535,7 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount), oldsfep = xfs_dir2_sf_firstentry(oldsfp), add_datasize = xfs_dir2_data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; @@ -617,7 +617,7 @@ xfs_dir2_sf_addname_pick( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(args->namelen); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -688,7 +688,7 @@ xfs_dir2_sf_check( dp = args->dp; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount); ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -768,9 +768,7 @@ xfs_dir2_sf_create( int /* error */ xfs_dir2_sf_getdents( xfs_inode_t *dp, /* incore directory inode */ - void *dirent, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx) { int i; /* shortform entry number */ xfs_mount_t *mp; /* filesystem mount point */ @@ -802,7 +800,7 @@ xfs_dir2_sf_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) return 0; /* @@ -812,29 +810,27 @@ xfs_dir2_sf_getdents( * mp->m_dirdatablk. */ dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET); + XFS_DIR3_DATA_DOT_OFFSET(mp)); dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET); + XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); /* * Put . entry unless we're starting past it. */ - if (*offset <= dot_offset) { - if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { - *offset = dot_offset & 0x7fffffff; + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) return 0; - } } /* * Put .. entry unless we're starting past it. */ - if (*offset <= dotdot_offset) { + if (ctx->pos <= dotdot_offset) { ino = xfs_dir2_sf_get_parent_ino(sfp); - if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { - *offset = dotdot_offset & 0x7fffffff; + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; - } } /* @@ -845,21 +841,20 @@ xfs_dir2_sf_getdents( off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, xfs_dir2_sf_get_offset(sfep)); - if (*offset > off) { + if (ctx->pos > off) { sfep = xfs_dir2_sf_nextentry(sfp, sfep); continue; } ino = xfs_dir2_sfe_get_ino(sfp, sfep); - if (filldir(dirent, (char *)sfep->name, sfep->namelen, - off & 0x7fffffff, ino, DT_UNKNOWN)) { - *offset = off & 0x7fffffff; + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, + ino, DT_UNKNOWN)) return 0; - } sfep = xfs_dir2_sf_nextentry(sfp, sfep); } - *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; return 0; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8025eb23ad72..f01012de06d0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -36,6 +36,7 @@ #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_cksum.h" #include "xfs_trace.h" /* @@ -85,17 +86,23 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_dquot *dq) { - xfs_quotainfo_t *q = mp->m_quotainfo; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; ASSERT(d->d_id); - if (q->qi_bsoftlimit && !d->d_blk_softlimit) + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); - if (q->qi_bhardlimit && !d->d_blk_hardlimit) + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } if (q->qi_isoftlimit && !d->d_ino_softlimit) d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); if (q->qi_ihardlimit && !d->d_ino_hardlimit) @@ -104,6 +111,9 @@ xfs_qm_adjust_dqlimits( d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); } /* @@ -239,6 +249,11 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } xfs_trans_dquot_buf(tp, bp, @@ -248,25 +263,95 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } -static void +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit. + */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) +{ + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; + } + + space = dqp->q_prealloc_hi_wmark; + + do_div(space, 100); + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_qm_calc_dquots_per_chunk(mp, + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF)) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + return true; +} + +STATIC bool xfs_dquot_buf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; xfs_dqid_t id = 0; + int ndquots; int i; /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length); + + /* * On the first read of the buffer, verify that each dquot is valid. * We don't know what the id of the dquot is supposed to be, just that * they should be increasing monotonically within the buffer. If the * first id is corrupt, then it will fail on the second dquot in the * buffer so corruptions could point to the wrong dquot in this case. */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; ddq = &d[i].dd_diskdq; @@ -274,27 +359,41 @@ xfs_dquot_buf_verify( id = be32_to_cpu(ddq->d_id); error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } + "xfs_dquot_buf_verify"); + if (error) + return false; } + return true; } static void xfs_dquot_buf_read_verify( struct xfs_buf *bp) { - xfs_dquot_buf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ void xfs_dquot_buf_write_verify( struct xfs_buf *bp) { - xfs_dquot_buf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -471,13 +570,13 @@ xfs_qm_dqtobp( xfs_buf_t **O_bpp, uint flags) { - xfs_bmbt_irec_t map; - int nmaps = 1, error; - xfs_buf_t *bp; - xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); - xfs_mount_t *mp = dqp->q_mount; - xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); - xfs_trans_t *tp = (tpp ? *tpp : NULL); + struct xfs_bmbt_irec map; + int nmaps = 1, error; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_mount *mp = dqp->q_mount; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + struct xfs_trans *tp = (tpp ? *tpp : NULL); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; @@ -648,6 +747,9 @@ xfs_qm_dqread( dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + /* Mark the buf so that this will stay incore a little longer */ xfs_buf_set_ref(bp, XFS_DQUOT_REF); @@ -702,7 +804,7 @@ xfs_qm_dqget( xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; @@ -1035,6 +1137,23 @@ xfs_qm_dqflush( &dqp->q_logitem.qli_item.li_lsn); /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c694a8469c4a..b596626249b8 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -32,6 +32,13 @@ struct xfs_mount; struct xfs_trans; +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + /* * The incore dquot structure */ @@ -51,6 +58,9 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; struct mutex q_qlock; /* quota lock */ struct completion q_flush; /* flush completion queue */ atomic_t q_pincount; /* dquot pin count */ @@ -133,10 +143,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) -#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \ - XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ - XFS_DQ_TO_QINF(dqp)->qi_gquotaip) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); @@ -145,14 +151,16 @@ extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, - xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 610456054dc2..35d3f5b041dd 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -66,7 +66,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, int i; int64_t fsid; - if (random32() % randfactor) + if (prandom_u32() % randfactor) return 0; memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); @@ -178,7 +178,7 @@ xfs_corruption_error( inst_t *ra) { if (level <= xfs_error_level) - xfs_hex_dump(p, 16); + xfs_hex_dump(p, 64); xfs_error_report(tag, level, mp, filename, linenum, ra); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index feb36d7551ae..452920a3f03f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -50,9 +50,8 @@ xfs_efi_item_free( * Freeing the efi requires that we remove it from the AIL if it has already * been placed there. However, the EFI may not yet have been placed in the AIL * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the - * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees - * the EFI. + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ STATIC void __xfs_efi_release( @@ -60,7 +59,7 @@ __xfs_efi_release( { struct xfs_ail *ailp = efip->efi_item.li_ailp; - if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + if (atomic_dec_and_test(&efip->efi_refcount)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ xfs_trans_ail_delete(ailp, &efip->efi_item, @@ -126,8 +125,8 @@ xfs_efi_item_pin( * which the EFI is manipulated during a transaction. If we are being asked to * remove the EFI it's because the transaction has been cancelled and by * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() (via - * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -171,19 +170,13 @@ xfs_efi_item_unlock( /* * The EFI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. For bulk transaction committed - * processing, the EFI may be processed but not yet unpinned prior to the EFD - * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected - * when processing the EFD. + * the lsn at which it's been logged. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { - struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -241,6 +234,7 @@ xfs_efi_init( efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); return efip; } @@ -310,8 +304,14 @@ xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 375f68e42531..432222418c56 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -114,16 +114,20 @@ typedef struct xfs_efd_log_format_64 { * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ #define XFS_EFI_RECOVERED 1 -#define XFS_EFI_COMMITTED 2 /* - * This is the "extent free intention" log item. It is used - * to log the fact that some extents need to be free. It is - * used in conjunction with the "extent free done" log item - * described below. + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be free. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; + atomic_t efi_refcount; atomic_t efi_next_extent; unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f03bf1a456fb..de3dc98f4e8f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" +#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> @@ -775,8 +776,6 @@ xfs_file_aio_write( if (ocount == 0) return 0; - sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ret = -EIO; goto out; @@ -800,7 +799,6 @@ xfs_file_aio_write( } out: - sb_end_write(inode->i_sb); return ret; } @@ -893,7 +891,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir2_data_readahead(NULL, ip, 0, -1); + xfs_dir3_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } @@ -908,11 +906,10 @@ xfs_file_release( STATIC int xfs_file_readdir( - struct file *filp, - void *dirent, - filldir_t filldir) + struct file *file, + struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); xfs_inode_t *ip = XFS_I(inode); int error; size_t bufsize; @@ -931,8 +928,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - error = xfs_readdir(ip, dirent, bufsize, - (xfs_off_t *)&filp->f_pos, filldir); + error = xfs_readdir(ip, ctx, bufsize); if (error) return -error; return 0; @@ -1272,8 +1268,7 @@ xfs_seek_data( } out: - if (offset != file->f_pos) - file->f_pos = offset; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: xfs_iunlock_map_shared(ip, lock); @@ -1381,8 +1376,7 @@ out: * situation in particular. */ offset = min_t(loff_t, offset, isize); - if (offset != file->f_pos) - file->f_pos = offset; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: xfs_iunlock_map_shared(ip, lock); @@ -1434,7 +1428,7 @@ const struct file_operations xfs_file_operations = { const struct file_operations xfs_dir_file_operations = { .open = xfs_dir_open, .read = generic_read_dir, - .readdir = xfs_file_readdir, + .iterate = xfs_file_readdir, .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f949b04..d04695545397 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2866b8c78b7a..614eb0cc3608 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -99,7 +99,9 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -174,7 +176,7 @@ xfs_growfs_data_private( if (!bp) return EIO; if (bp->b_error) { - int error = bp->b_error; + error = bp->b_error; xfs_buf_relse(bp); return error; } @@ -247,6 +249,9 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -265,6 +270,11 @@ xfs_growfs_data_private( } agfl = XFS_BUF_TO_AGFL(bp); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + } for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); @@ -296,8 +306,11 @@ xfs_growfs_data_private( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,7 +329,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -339,7 +358,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -363,7 +388,12 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 515bf71ce01c..7a0c17d7ec09 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -36,6 +36,9 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" +#include "xfs_icreate_item.h" /* @@ -148,12 +151,16 @@ xfs_check_agi_freecount( #endif /* - * Initialise a new set of inodes. + * Initialise a new set of inodes. When called without a transaction context + * (e.g. from recovery) we initiate a delayed write of the inode buffers rather + * than logging them (which in a transaction context puts them into the AIL + * for writeback rather than the xfsbufd queue). */ -STATIC int +int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -165,6 +172,7 @@ xfs_ialloc_inode_init( int version; int i, j; xfs_daddr_t d; + xfs_ino_t ino = 0; /* * Loop over the new block(s), filling in the inodes. @@ -183,13 +191,41 @@ xfs_ialloc_inode_init( } /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have ot log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. */ - if (xfs_sb_version_hasnlink(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + + /* + * log the initialisation that is about to take place as an + * logical operation. This means the transaction does not + * need to log the physical changes to the inode buffers as log + * recovery will know what initialisation is actually needed. + * Hence we only need to log the buffers as "ordered" buffers so + * they track in the AIL as if they were physically logged. + */ + if (tp) + xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), + mp->m_sb.sb_inodesize, length, gen); + } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; else version = 1; @@ -204,27 +240,58 @@ xfs_ialloc_inode_init( XBF_UNMAPPED); if (!fbuf) return ENOMEM; - /* - * Initialize all inodes in this buffer and then log them. - * - * XXX: It would be much better if we had just one transaction - * to log a whole cluster of inodes instead of all the - * individual transactions causing a lot of log traffic. - */ + + /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; - xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); + uint isize = xfs_dinode_size(version); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else if (tp) { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + + if (tp) { + /* + * Mark the buffer as an inode allocation buffer so it + * sticks in AIL at the point of this allocation + * transaction. This ensures the they are on disk before + * the tail of the log can be moved past this + * transaction (i.e. by preventing relogging from moving + * it forward in the log). + */ + xfs_trans_inode_alloc_buf(tp, fbuf); + if (version == 3) { + /* + * Mark the buffer as ordered so that they are + * not physically logged in the transaction but + * still tracked in the AIL as part of the + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; + xfs_buf_delwri_queue(fbuf, buffer_list); + xfs_buf_relse(fbuf); } - xfs_trans_inode_alloc_buf(tp, fbuf); } return 0; } @@ -269,7 +336,7 @@ xfs_ialloc_ag_alloc( * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. - */ + */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); @@ -368,8 +435,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, - args.len, random32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, + args.len, prandom_u32()); if (error) return error; @@ -581,8 +648,7 @@ xfs_ialloc_get_rec( struct xfs_btree_cur *cur, xfs_agino_t agino, xfs_inobt_rec_incore_t *rec, - int *done, - int left) + int *done) { int error; int i; @@ -690,12 +756,12 @@ xfs_dialloc_ag( pag->pagl_leftrec != NULLAGINO && pag->pagl_rightrec != NULLAGINO) { error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, - &trec, &doneleft, 1); + &trec, &doneleft); if (error) goto error1; error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, - &rec, &doneright, 0); + &rec, &doneright); if (error) goto error1; } else { @@ -1453,6 +1519,7 @@ xfs_ialloc_log_agi( /* * Log the allocation group inode header buffer. */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, bp, first, last); } @@ -1470,19 +1537,23 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif -static void +static bool xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); - int agi_ok; + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; /* * Validate the magic number of the agi block. */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -1490,30 +1561,52 @@ xfs_agi_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } xfs_check_agi_unlinked(agi); + return true; } static void xfs_agi_read_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + int agi_ok = 1; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agi, agi_crc)); + agi_ok = agi_ok && xfs_agi_verify(bp); + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_agi_write_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agi, agi_crc)); } const struct xfs_buf_ops xfs_agi_buf_ops = { diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index c8da3df271e6..68c07320f096 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +/* + * Inode chunk initialisation routine + */ +int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_agblock_t length, unsigned int gen); + extern const struct xfs_buf_ops xfs_agi_buf_ops; #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index bec344b36507..5448eb6b8c12 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -34,6 +34,7 @@ #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC int @@ -182,52 +183,88 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } -void +static int xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ - /* magic number and level verification */ - level = be16_to_cpu(block->bb_level); - sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && - level < mp->m_in_maxlevels; + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. + */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + break; + default: + return 0; + } - /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_inobt_read_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_inobt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_inobt_buf_ops = { @@ -235,7 +272,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_write = xfs_inobt_write_verify, }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -273,7 +310,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, #endif @@ -301,6 +338,8 @@ xfs_inobt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_inobt_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 25c0239a8eab..3ac36b7642e9 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -29,7 +29,8 @@ struct xfs_mount; /* * There is a btree for the inode map per allocation group. */ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) @@ -76,10 +77,10 @@ typedef __be32 xfs_inobt_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 96e344e3e927..9560dc1f15a9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -335,7 +335,8 @@ xfs_iget_cache_miss( iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; + ip->i_udquot = NULL; + ip->i_gdquot = NULL; xfs_iflags_set(ip, iflags); /* insert the new inode */ diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index e0f138c70a2f..a01afbb3909a 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); -int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags, void *args), diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c new file mode 100644 index 000000000000..7716a4e7375e --- /dev/null +++ b/fs/xfs/xfs_icreate_item.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2008-2010, 2013 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_error.h" +#include "xfs_icreate_item.h" + +kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_icreate_item, ic_item); +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We only need one iovec for the icreate log structure. + */ +STATIC uint +xfs_icreate_item_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode create log item. + */ +STATIC void +xfs_icreate_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; + log_vector->i_len = sizeof(struct xfs_icreate_log); + log_vector->i_type = XLOG_REG_TYPE_ICREATE; +} + + +/* Pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_pin( + struct xfs_log_item *lip) +{ +} + + +/* pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +STATIC void +xfs_icreate_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + if (icp->ic_item.li_flags & XFS_LI_ABORTED) + kmem_zone_free(xfs_icreate_zone, icp); + return; +} + +/* + * Because we have ordered buffers being tracked in the AIL for the inode + * creation, we don't need the create item after this. Hence we can free + * the log item and return -1 to tell the caller we're done with the item. + */ +STATIC xfs_lsn_t +xfs_icreate_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + kmem_zone_free(xfs_icreate_zone, icp); + return (xfs_lsn_t)-1; +} + +/* item can never get into the AIL */ +STATIC uint +xfs_icreate_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + ASSERT(0); + return XFS_ITEM_SUCCESS; +} + +/* Ordered buffers do the dependency tracking here, so this does nothing. */ +STATIC void +xfs_icreate_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_icreate_item_ops = { + .iop_size = xfs_icreate_item_size, + .iop_format = xfs_icreate_item_format, + .iop_pin = xfs_icreate_item_pin, + .iop_unpin = xfs_icreate_item_unpin, + .iop_push = xfs_icreate_item_push, + .iop_unlock = xfs_icreate_item_unlock, + .iop_committed = xfs_icreate_item_committed, + .iop_committing = xfs_icreate_item_committing, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the icreate item. + */ +void +xfs_icreate_log( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + unsigned int count, + unsigned int inode_size, + xfs_agblock_t length, + unsigned int generation) +{ + struct xfs_icreate_item *icp; + + icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + + xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, + &xfs_icreate_item_ops); + + icp->ic_format.icl_type = XFS_LI_ICREATE; + icp->ic_format.icl_size = 1; /* single vector */ + icp->ic_format.icl_ag = cpu_to_be32(agno); + icp->ic_format.icl_agbno = cpu_to_be32(agbno); + icp->ic_format.icl_count = cpu_to_be32(count); + icp->ic_format.icl_isize = cpu_to_be32(inode_size); + icp->ic_format.icl_length = cpu_to_be32(length); + icp->ic_format.icl_gen = cpu_to_be32(generation); + + xfs_trans_add_item(tp, &icp->ic_item); + tp->t_flags |= XFS_TRANS_DIRTY; + icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h new file mode 100644 index 000000000000..88ba8aa0bc41 --- /dev/null +++ b/fs/xfs/xfs_icreate_item.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2008-2010, Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_ICREATE_ITEM_H +#define XFS_ICREATE_ITEM_H 1 + +/* + * on disk log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. + */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +/* in memory log item structure */ +struct xfs_icreate_item { + struct xfs_log_item ic_item; + struct xfs_icreate_log ic_format; +}; + +extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t agbno, unsigned int count, + unsigned int inode_size, xfs_agblock_t length, + unsigned int generation); + +#endif /* XFS_ICREATE_ITEM_H */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4f201656d2d9..9ecfe1e559fc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,6 +44,7 @@ #include "xfs_quota.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -286,7 +287,7 @@ xfs_ilock_demote( trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int xfs_isilocked( xfs_inode_t *ip, @@ -786,6 +787,7 @@ xfs_iformat_btree( xfs_dinode_t *dip, int whichfork) { + struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; xfs_ifork_t *ifp; /* REFERENCED */ @@ -794,7 +796,7 @@ xfs_iformat_btree( ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); /* @@ -805,14 +807,14 @@ xfs_iformat_btree( * blocks. */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return XFS_ERROR(EFSCORRUPTED); } @@ -823,8 +825,7 @@ xfs_iformat_btree( * Copy and convert from the on-disk structure * to the in-memory structure. */ - xfs_bmdr_to_bmbt(ip->i_mount, dfp, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; @@ -866,6 +867,17 @@ xfs_dinode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } void @@ -902,6 +914,17 @@ xfs_dinode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } STATIC uint @@ -962,8 +985,54 @@ xfs_dic2xflags( (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc))) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc)); + dip->di_crc = xfs_end_cksum(crc); +} + /* * Read the disk inode attributes into the in-core inode structure. + * + * If we are initialising a new inode and we are not utilising the + * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core + * with a random generation number. If we are keeping inodes around, we need to + * read the inode cluster to get the existing generation number off disk. */ int xfs_iread( @@ -983,6 +1052,22 @@ xfs_iread( if (error) return error; + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the on-disk inode core */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else + ip->i_d.di_version = 2; + return 0; + } + /* * Get pointers to the on-disk inode and the buffer containing it. */ @@ -990,17 +1075,13 @@ xfs_iread( if (error) return error; - /* - * If we got something that isn't an inode it means someone - * (nfs or dmi) has a stale handle. - */ - if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { -#ifdef DEBUG - xfs_alert(mp, - "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", - __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = XFS_ERROR(EFSCORRUPTED); goto out_brelse; } @@ -1022,10 +1103,20 @@ xfs_iread( goto out_brelse; } } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. + */ ip->i_d.di_magic = be16_to_cpu(dip->di_magic); ip->i_d.di_version = dip->di_version; ip->i_d.di_gen = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -1063,17 +1154,16 @@ xfs_iread( xfs_buf_set_ref(bp, XFS_INO_REF); /* - * Use xfs_trans_brelse() to release the buffer containing the - * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal * brelse(). If we're within a transaction, then xfs_trans_brelse() * will only release the buffer if it is not dirty within the * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be - * locking the new in-core inode before putting it in the hash - * table where other processes can find it. Thus we don't have - * to worry about the inode being changed just because we released - * the buffer. + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. */ out_brelse: xfs_trans_brelse(tp, bp); @@ -1161,6 +1251,7 @@ xfs_ialloc( xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; uint flags; @@ -1187,7 +1278,7 @@ xfs_ialloc( * This is because we're setting fields here we need * to prevent others from looking at until we're done. */ - error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error) return error; @@ -1208,7 +1299,7 @@ xfs_ialloc( * the inode version number now. This way we only do the conversion * here rather than here and in the flush/logging code. */ - if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && + if (xfs_sb_version_hasnlink(&mp->m_sb) && ip->i_d.di_version == 1) { ip->i_d.di_version = 2; /* @@ -1258,6 +1349,19 @@ xfs_ialloc( ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -1554,6 +1658,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1639,6 +1747,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1712,6 +1824,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1725,6 +1841,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1928,8 +2048,6 @@ xfs_ifree( int error; int delete; xfs_ino_t first_ino; - xfs_dinode_t *dip; - xfs_buf_t *ibp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -1942,14 +2060,13 @@ xfs_ifree( * Pull the on-disk inode from the AGI unlinked list. */ error = xfs_iunlink_remove(tp, ip); - if (error != 0) { + if (error) return error; - } error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); - if (error != 0) { + if (error) return error; - } + ip->i_d.di_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; @@ -1961,31 +2078,10 @@ xfs_ifree( * by reincarnations of this inode. */ ip->i_d.di_gen++; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) - return error; - - /* - * Clear the on-disk di_mode. This is to prevent xfs_bulkstat - * from picking up this inode when it is reclaimed (its incore state - * initialzed but not flushed to disk yet). The in-core di_mode is - * already cleared and a corresponding transaction logged. - * The hack here just synchronizes the in-core to on-disk - * di_mode value in advance before the actual inode sync to disk. - * This is OK because the inode is already unlinked and would never - * change its di_mode again for this inode generation. - * This is a temporary hack that would require a proper fix - * in the future. - */ - dip->di_mode = 0; - - if (delete) { + if (delete) error = xfs_ifree_cluster(ip, tp, first_ino); - } return error; } @@ -2037,7 +2133,7 @@ xfs_iroot_realloc( * allocate it now and get out. */ if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; @@ -2051,9 +2147,9 @@ xfs_iroot_realloc( */ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -2061,7 +2157,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); return; } @@ -2076,7 +2172,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); else new_size = 0; if (new_size > 0) { @@ -2084,7 +2180,8 @@ xfs_iroot_realloc( /* * First copy over the btree block header. */ - memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; ifp->if_flags &= ~XFS_IFBROOT; @@ -2114,7 +2211,7 @@ xfs_iroot_realloc( ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); return; } @@ -2427,7 +2524,7 @@ xfs_iflush_fork( ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); + XFS_BROOT_SIZE_ADJ(ip))); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); @@ -2715,20 +2812,18 @@ abort_out: STATIC int xfs_iflush_int( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_inode_log_item_t *iip; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - iip = ip->i_itemp; - mp = ip->i_mount; + ASSERT(iip != NULL && iip->ili_fields != 0); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -2789,9 +2884,9 @@ xfs_iflush_int( } /* * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. + * postdate a log record during recovery. This is redundant as we now + * log every change and hence this can't happen. Still, it doesn't hurt. */ - ip->i_d.di_flushiter++; /* @@ -2867,41 +2962,30 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ - if (iip != NULL && iip->ili_fields != 0) { - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_logged = 1; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - ASSERT(bp->b_fspriv != NULL); - ASSERT(bp->b_iodone != NULL); - } else { - /* - * We're flushing an inode which is not in the AIL and has - * not been logged. For this case we can immediately drop - * the inode flush lock because we can avoid the whole - * AIL state thing. It's OK to drop the flush lock now, - * because we've already locked the buffer and to do anything - * you really need both. - */ - if (iip != NULL) { - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_last_fields == 0); - ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); - } - xfs_ifunlock(ip); - } + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); + + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); return 0; corrupt_out: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 237e7f6f2ab3..91129794aaec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -150,13 +150,38 @@ typedef struct xfs_icdinode { __uint16_t di_dmstate; /* DMIG state info */ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ } xfs_icdinode_t; +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + /* * Flags for xfs_ichgtime(). */ #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ /* * Per-fork incore inode flags. @@ -180,10 +205,11 @@ typedef struct xfs_icdinode { #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount)) + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) #define XFS_IFORK_ASIZE(ip) \ (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ @@ -555,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f034bd1652f0..f76ff52e43c0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -179,7 +179,7 @@ xfs_inode_item_format( nvecs = 1; vecp->i_addr = &ip->i_d; - vecp->i_len = sizeof(struct xfs_icdinode); + vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d681e34c2950..5e999680094a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -422,9 +422,12 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); - if (!kbuf) - goto out_dput; + kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); + if (!kbuf) { + kbuf = kmem_zalloc_large(al_hreq.buflen); + if (!kbuf) + goto out_dput; + } cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -436,7 +439,10 @@ xfs_attrlist_by_handle( error = -EFAULT; out_kfree: - kfree(kbuf); + if (is_vmalloc_addr(kbuf)) + kmem_free_large(kbuf); + else + kmem_free(kbuf); out_dput: dput(dentry); return error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 63b8fc432151..c0c66259cc91 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -373,9 +373,12 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); - if (!kbuf) - goto out_dput; + kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); + if (!kbuf) { + kbuf = kmem_zalloc_large(al_hreq.buflen); + if (!kbuf) + goto out_dput; + } cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -387,7 +390,10 @@ xfs_compat_attrlist_by_handle( error = -EFAULT; out_kfree: - kfree(kbuf); + if (is_vmalloc_addr(kbuf)) + kmem_free_large(kbuf); + else + kmem_free(kbuf); out_dput: dput(dentry); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a30dd899d2b..6a7096422295 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -42,6 +42,8 @@ #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -282,6 +284,15 @@ xfs_iomap_eof_want_preallocate( return 0; /* + * If the file is smaller than the minimum prealloc and we are using + * dynamic preallocation, don't do any preallocation at all as it is + * likely this is the only write to the file that is going to be done. + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) + return 0; + + /* * If there are any real blocks past eof, then don't * do any speculative allocation. */ @@ -343,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size( if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) return 0; + /* If the file is small, then use the minimum prealloc */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) + return 0; + /* * As we write multiple pages, the offset will always align to the * start of a page and hence point to a hole at EOF. i.e. if the size is @@ -362,10 +377,65 @@ xfs_iomap_eof_prealloc_initial_size( if (imap[0].br_startblock == HOLESTARTBLOCK) return 0; if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) - return imap[0].br_blockcount; + return imap[0].br_blockcount << 1; return XFS_B_TO_FSB(mp, offset); } +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* over hi wmark, squash the prealloc completely */ + if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + /* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size @@ -381,45 +451,89 @@ xfs_iomap_prealloc_size( int nimaps) { xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, imap, nimaps); - if (alloc_blocks > 0) { - int shift = 0; - int64_t freesp; - - alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, - rounddown_pow_of_two(alloc_blocks)); - - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } - if (shift) - alloc_blocks >>= shift; + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; - /* - * If we are still trying to allocate more space than is - * available, squash the prealloc hard. This can happen if we - * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. - */ - while (alloc_blocks && alloc_blocks >= freesp) - alloc_blocks >>= 4; + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN. + */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; } + /* + * Check each quota to cap the prealloc size and provide a shift + * value to throttle with. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. + */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + return alloc_blocks; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa2ac73..c69bbc493cb0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -455,6 +455,28 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -606,18 +628,8 @@ xfs_setattr_nonsize( /* * Change file access modes. */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. @@ -714,9 +726,8 @@ xfs_setattr_size( return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; @@ -860,6 +871,12 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; @@ -970,7 +987,8 @@ xfs_fiemap_format( if (bmv->bmv_oflags & BMV_OF_PREALLOC) fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= FIEMAP_EXTENT_DELALLOC; + fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); physical = 0; /* no block yet */ } if (bmv->bmv_oflags & BMV_OF_LAST) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 2ea7d402188d..bc92c5306a17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -43,7 +43,7 @@ xfs_internal_inum( { return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || (xfs_sb_version_hasquota(&mp->m_sb) && - (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); + xfs_is_quota_inode(&mp->m_sb, ino))); } /* @@ -383,11 +383,13 @@ xfs_bulkstat( * Also start read-ahead now for this chunk. */ if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + struct blk_plug plug; /* * Loop over all clusters in the next chunk. * Do a readahead if there are any allocated * inodes in that cluster. */ + blk_start_plug(&plug); agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; @@ -399,6 +401,7 @@ xfs_bulkstat( agbno, nbcluster, &xfs_inode_buf_ops); } + blk_finish_plug(&plug); irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index fe7e4df85a7b..800f896a6cc4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -72,6 +72,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/list_sort.h> +#include <linux/ratelimit.h> #include <asm/page.h> #include <asm/div64.h> @@ -292,22 +293,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#ifndef DEBUG -#define ASSERT(expr) ((void)0) +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC static noinline +# define STATIC noinline #endif -#else /* DEBUG */ +#else /* !DEBUG */ + +#ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC noinline +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline #endif +#endif /* XFS_WARN */ #endif /* DEBUG */ #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index eec226f78a40..d852a2b3e1fd 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length( headers++; for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + headers += lv->lv_niovecs; for (i = 0; i < lv->lv_niovecs; i++) { @@ -2216,7 +2220,7 @@ xlog_write( index = 0; lv = log_vector; vecp = lv->lv_iovecp; - while (lv && index < lv->lv_niovecs) { + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2236,13 +2240,22 @@ xlog_write( * This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ - while (lv && index < lv->lv_niovecs) { - struct xfs_log_iovec *reg = &vecp[index]; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; int start_rec_copy; int copy_len; int copy_off; + bool ordered = false; + + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + reg = &vecp[index]; ASSERT(reg->i_len % sizeof(__int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); @@ -2302,12 +2315,13 @@ xlog_write( break; if (++index == lv->lv_niovecs) { +next_lv: lv = lv->lv_next; index = 0; if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0) { + if (record_cnt == 0 && ordered == false) { if (!lv) return 0; break; @@ -3485,7 +3499,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = random32(); + tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 5caee96059df..fb630e496c12 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_UNMOUNT 17 #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_MAX 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 typedef struct xfs_log_iovec { void *i_addr; /* beginning address of region */ @@ -105,6 +106,8 @@ struct xfs_log_vec { int lv_buf_len; /* size of formatted buffer */ }; +#define XFS_LOG_VEC_ORDERED (-1) + /* * Structure used to pass callback function and the function's argument * to the log manager. diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ddc4529d07d3..02b9cf3f8252 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs( int index; int len = 0; uint niovecs; + bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) @@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs( if (!niovecs) continue; + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + } + new_lv = kmem_zalloc(sizeof(*new_lv) + niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP); + KM_SLEEP|KM_NOFS); + + new_lv->lv_item = lidp->lid_item; + new_lv->lv_niovecs = niovecs; + if (ordered) { + /* track as an ordered logvec */ + new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto next; + } /* The allocated iovec region lies beyond the log vector. */ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; - new_lv->lv_niovecs = niovecs; - new_lv->lv_item = lidp->lid_item; /* build the vector array and calculate it's length */ IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); @@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs( } ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); +next: if (!ret_lv) ret_lv = new_lv; else @@ -191,8 +209,18 @@ xfs_cil_prepare_item( if (old) { /* existing lv on log item, space used is a delta */ - ASSERT(!list_empty(&lv->lv_item->li_cil)); - ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) || + old->lv_buf_len == XFS_LOG_VEC_ORDERED); + + /* + * If the new item is ordered, keep the old one that is already + * tracking dirty or ordered regions + */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(!lv->lv_buf); + kmem_free(lv); + return; + } *len += lv->lv_buf_len - old->lv_buf_len; *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; @@ -201,10 +229,11 @@ xfs_cil_prepare_item( } else { /* new lv, must pin the log item */ ASSERT(!lv->lv_item->li_lv); - ASSERT(list_empty(&lv->lv_item->li_cil)); - *len += lv->lv_buf_len; - *diff_iovecs += lv->lv_niovecs; + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *len += lv->lv_buf_len; + *diff_iovecs += lv->lv_niovecs; + } IOP_PIN(lv->lv_item); } @@ -259,18 +288,24 @@ xlog_cil_insert_items( * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ - for (lv = log_vector; lv; lv = lv->lv_next) - xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); - - /* account for space used by new iovec headers */ - len += diff_iovecs * sizeof(xlog_op_header_t); - spin_lock(&cil->xc_cil_lock); + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; - /* move the items to the tail of the CIL */ - for (lv = log_vector; lv; lv = lv->lv_next) + ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil)); + lv->lv_next = NULL; + + /* + * xfs_cil_prepare_item() may free the lv, so move the item on + * the CIL first. + */ list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); + xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); + lv = next; + } + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); ctx->nvecs += diff_iovecs; /* @@ -381,9 +416,7 @@ xlog_cil_push( struct xfs_cil_ctx *new_ctx; struct xlog_in_core *commit_iclog; struct xlog_ticket *tic; - int num_lv; int num_iovecs; - int len; int error = 0; struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; @@ -428,12 +461,9 @@ xlog_cil_push( * side which is currently locked out by the flush lock. */ lv = NULL; - num_lv = 0; num_iovecs = 0; - len = 0; while (!list_empty(&cil->xc_cil)) { struct xfs_log_item *item; - int i; item = list_first_entry(&cil->xc_cil, struct xfs_log_item, li_cil); @@ -444,11 +474,7 @@ xlog_cil_push( lv->lv_next = item->li_lv; lv = item->li_lv; item->li_lv = NULL; - - num_lv++; num_iovecs += lv->lv_niovecs; - for (i = 0; i < lv->lv_niovecs; i++) - len += lv->lv_iovecp[i].i_len; } /* @@ -668,10 +694,6 @@ xlog_cil_push_foreground( * transaction to the checkpoint context so we carry the busy extents through * to checkpoint completion, and then unlock all the items in the transaction. * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * * Called with the context lock already held in read mode to lock out * background commit, returns without it held once background commits are * allowed again. @@ -705,6 +727,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = log->l_cilp->xc_ctx->sequence; + /* xlog_cil_insert_items() destroys log_vector list */ xlog_cil_insert_items(log, log_vector, tp->t_ticket); /* check we didn't blow the reservation */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 16d8d12ea3b4..b9ea262dd1c2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -468,7 +468,6 @@ struct xfs_cil { * threshold, yet give us plenty of space for aggregation on large logs. */ #define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) -#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * ticket grant locks, queues and accounting have their own cachlines diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d1dba7ce75ae..6fcc910a50b9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -29,6 +29,7 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -44,6 +45,15 @@ #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_icreate_item.h" + +/* Need all the magic numbers and buffer ops structures from these headers */ +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" STATIC int xlog_find_zeroed( @@ -1590,10 +1600,53 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. */ STATIC int xlog_recover_reorder_trans( @@ -1603,19 +1656,32 @@ xlog_recover_reorder_trans( { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); break; } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1623,7 +1689,7 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, @@ -1634,6 +1700,14 @@ xlog_recover_reorder_trans( } } ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); return 0; } @@ -1786,6 +1860,13 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + @@ -1851,12 +1932,216 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; } /* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recovery_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. + */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates @@ -1892,6 +2177,17 @@ xlog_recover_do_reg_buffer( ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. @@ -1928,6 +2224,17 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); + + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* @@ -2048,6 +2355,12 @@ xfs_qm_dqcheck( d->dd_diskdq.d_flags = type; d->dd_diskdq.d_id = cpu_to_be32(id); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + return errs; } @@ -2213,6 +2526,7 @@ xlog_recover_inode_pass2( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { @@ -2238,7 +2552,7 @@ xlog_recover_inode_pass2( trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - NULL); + &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; @@ -2349,7 +2663,8 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); @@ -2361,13 +2676,13 @@ xlog_recover_inode_pass2( } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2451,6 +2766,9 @@ xlog_recover_inode_pass2( } write_inode_buffer: + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); @@ -2570,6 +2888,10 @@ xlog_recover_dquot_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); @@ -2674,6 +2996,93 @@ xlog_recover_efd_pass2( } /* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == XFS_IALLOC_INODES(mp)); + ASSERT(length == XFS_IALLOC_BLOCKS(mp)); + if (count != XFS_IALLOC_INODES(mp) || + length != XFS_IALLOC_BLOCKS(mp)) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; +} + +/* * Free up any resources allocated by the transaction * * Remember that EFIs, EFDs, and IUNLINKs are handled later. @@ -2715,6 +3124,7 @@ xlog_recover_commit_pass1( case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_DQUOT: + case XFS_LI_ICREATE: /* nothing to do in pass 1 */ return 0; default: @@ -2745,6 +3155,8 @@ xlog_recover_commit_pass2( return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2948,6 +3360,7 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } @@ -3751,6 +4164,25 @@ xlog_recover( return error; } + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions, then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 331cd9f83a7f..9163dc140532 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -93,6 +93,14 @@ xfs_alert_tag( } void +asswarn(char *expr, char *file, int line) +{ + xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + WARN_ON(1); +} + +void assfail(char *expr, char *file, int line) { xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 56dc0c17f16a..85401155750e 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -30,7 +30,34 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + func(dev, fmt, ##__VA_ARGS__); \ +} while (0) + +#define xfs_emerg_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) +#define xfs_alert_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__) +#define xfs_crit_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__) +#define xfs_err_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__) +#define xfs_debug_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) + extern void assfail(char *expr, char *f, int l); +extern void asswarn(char *expr, char *f, int l); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3806088a8f77..2b0ba3581656 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" #ifdef HAVE_PERCPU_SB @@ -109,6 +111,14 @@ static const struct { { offsetof(xfs_sb_t, sb_logsunit), 0 }, { offsetof(xfs_sb_t, sb_features2), 0 }, { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { offsetof(xfs_sb_t, sb_features_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_incompat), 0 }, + { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, + { offsetof(xfs_sb_t, sb_crc), 0 }, + { offsetof(xfs_sb_t, sb_pad), 0 }, + { offsetof(xfs_sb_t, sb_pquotino), 0 }, + { offsetof(xfs_sb_t, sb_lsn), 0 }, { sizeof(xfs_sb_t), 0 } }; @@ -304,7 +314,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - bool check_inprogress) + bool check_inprogress, + bool check_version) { /* @@ -319,11 +330,63 @@ xfs_mount_validate_sb( return XFS_ERROR(EWRONGFS); } + if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } + if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) && + (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) { + xfs_notice(mp, +"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. + */ + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + xfs_alert(mp, +"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" +"Use of these features in this kernel is at your own risk!"); + + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return XFS_ERROR(EINVAL); + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return XFS_ERROR(EINVAL); + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -506,6 +569,18 @@ out_unwind: return error; } +static void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); +} + void xfs_sb_from_disk( struct xfs_sb *to, @@ -557,6 +632,43 @@ xfs_sb_from_disk( to->sb_logsunit = be32_to_cpu(from->sb_logsunit); to->sb_features2 = be32_to_cpu(from->sb_features2); to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); +} + +static inline void +xfs_sb_quota_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t *fields) +{ + __uint16_t qflags = from->sb_qflags; + + if (*fields & XFS_SB_QFLAGS) { + /* + * The in-core version of sb_qflags do not have + * XFS_OQUOTA_* flags, whereas the on-disk version + * does. So, convert incore XFS_{PG}QUOTA_* flags + * to on-disk XFS_OQUOTA_* flags. + */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + *fields &= ~XFS_SB_QFLAGS; + } } /* @@ -580,6 +692,7 @@ xfs_sb_to_disk( if (!fields) return; + xfs_sb_quota_to_disk(to, from, &fields); while (fields) { f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); first = xfs_sb_info[f].offset; @@ -612,13 +725,13 @@ xfs_sb_to_disk( } } -static void +static int xfs_sb_verify( - struct xfs_buf *bp) + struct xfs_buf *bp, + bool check_version) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; - int error; xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); @@ -626,16 +739,47 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); - if (error) - xfs_buf_ioerror(bp, error); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); } +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. + */ static void xfs_sb_read_verify( struct xfs_buf *bp) { - xfs_sb_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), + offsetof(struct xfs_sb, sb_crc))) { + error = EFSCORRUPTED; + goto out_error; + } + } + error = xfs_sb_verify(bp, true); + +out_error: + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + } } /* @@ -648,11 +792,10 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_sb sb; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); - if (sb.sb_magicnum == XFS_SB_MAGIC) { + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ xfs_sb_read_verify(bp); return; @@ -663,9 +806,27 @@ xfs_sb_quiet_read_verify( static void xfs_sb_write_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - xfs_sb_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp, false); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_sb, sb_crc)); } const struct xfs_buf_ops xfs_sb_buf_ops = { @@ -687,7 +848,8 @@ int xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; - xfs_buf_t *bp; + struct xfs_buf *bp; + struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); @@ -714,7 +876,7 @@ reread: if (bp->b_error) { error = bp->b_error; if (loud) - xfs_warn(mp, "SB validate failed"); + xfs_warn(mp, "SB validate failed with error %d.", error); goto release_buf; } @@ -723,13 +885,14 @@ reread: */ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + xfs_sb_quota_from_disk(&mp->m_sb); /* * We must be able to do sector-sized and sector-aligned IO. */ - if (sector_size > mp->m_sb.sb_sectsize) { + if (sector_size > sbp->sb_sectsize) { if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", - sector_size, mp->m_sb.sb_sectsize); + sector_size, sbp->sb_sectsize); error = ENOSYS; goto release_buf; } @@ -738,15 +901,18 @@ reread: * If device sector size is smaller than the superblock size, * re-read the superblock so the buffer is correctly sized. */ - if (sector_size < mp->m_sb.sb_sectsize) { + if (sector_size < sbp->sb_sectsize) { xfs_buf_relse(bp); - sector_size = mp->m_sb.sb_sectsize; + sector_size = sbp->sb_sectsize; goto reread; } /* Initialize per-cpu counters */ xfs_icsb_reinit_counters(mp); + /* no need to be quiet anymore, so reset the buf ops */ + bp->b_ops = &xfs_sb_buf_ops; + mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; @@ -872,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp) */ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "(sunit/swidth vs. blocksize)"); - return XFS_ERROR(EINVAL); - } - mp->m_dalign = mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } else { /* * Convert the stripe unit and width to FSBs. */ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "(sunit/swidth vs. ag size)"); - return XFS_ERROR(EINVAL); - } xfs_warn(mp, - "stripe alignment turned off: sunit(%d)/swidth(%d) " - "incompatible with agsize(%d)", - mp->m_dalign, mp->m_swidth, - sbp->sb_agblocks); - - mp->m_dalign = 0; - mp->m_swidth = 0; + "alignment check failed: sunit/swidth vs. agsize(%d)", + sbp->sb_agblocks); + return XFS_ERROR(EINVAL); } else if (mp->m_dalign) { mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "sunit(%d) less than bsize(%d)", - mp->m_dalign, - mp->m_blockmask +1); - return XFS_ERROR(EINVAL); - } - mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } } @@ -924,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp) sbp->sb_width = mp->m_swidth; mp->m_update_flags |= XFS_SB_WIDTH; } + } else { + xfs_warn(mp, + "cannot change alignment: superblock does not support data alignment"); + return XFS_ERROR(EINVAL); } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { @@ -1633,6 +1788,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) ASSERT((1LL << f) & XFS_SB_MOD_BITS); first = xfs_sb_info[f].offset; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, first, last); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bc907061d392..4e374d4a9189 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -192,8 +192,6 @@ typedef struct xfs_mount { xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ uint m_chsize; /* size of next field */ - struct xfs_chash *m_chash; /* fs private inode per-cluster - * hash table */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ @@ -207,7 +205,6 @@ typedef struct xfs_mount { trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct shrinker m_inode_shrink; /* inode reclaim shrinker */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -230,8 +227,6 @@ typedef struct xfs_mount { operations, typically for disk errors in metadata */ #define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ -#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to - user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ @@ -392,6 +387,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ +extern void xfs_sb_calc_crc(struct xfs_buf *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e5b5cf973781..7a3e007b49f4 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -41,6 +41,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire @@ -69,7 +70,7 @@ xfs_qm_dquot_walk( void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); uint32_t next_index; int last_error = 0; int skipped; @@ -188,7 +189,7 @@ xfs_qm_dqpurge( xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; @@ -298,8 +299,10 @@ xfs_qm_mount_quotas( */ if (!XFS_IS_UQUOTA_ON(mp)) mp->m_qflags &= ~XFS_UQUOTA_CHKD; - if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) - mp->m_qflags &= ~XFS_OQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; write_changes: /* @@ -488,8 +491,7 @@ xfs_qm_need_dqattach( return false; if (!XFS_NOT_DQATTACHED(mp, ip)) return false; - if (ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; return true; } @@ -605,8 +607,7 @@ xfs_qm_dqdetach( trace_xfs_dquot_dqdetach(ip); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); if (ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; @@ -617,6 +618,20 @@ xfs_qm_dqdetach( } } +int +xfs_qm_calc_dquots_per_chunk( + struct xfs_mount *mp, + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -656,9 +671,8 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(qinf->qi_dqchunklen); - qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); - do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp, + qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -826,7 +840,7 @@ xfs_qm_reset_dqcounts( xfs_dqid_t id, uint type) { - xfs_disk_dquot_t *ddq; + struct xfs_dqblk *dqb; int j; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -840,8 +854,12 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = bp->b_addr; + dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -858,7 +876,12 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } } @@ -894,15 +917,29 @@ xfs_qm_dqiter_bufs( XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + + /* + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. + */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + if (error) break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); - /* - * goto the next block. - */ + + /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } @@ -1057,7 +1094,7 @@ xfs_qm_quotacheck_dqadjust( * There are no timers for the default values set in the root dquot. */ if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } @@ -1115,7 +1152,7 @@ xfs_qm_dqusage_adjust( * rootino must have its resources accounted for, not so with the quota * inodes. */ - if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + if (xfs_is_quota_inode(&mp->m_sb, ino)) { *res = BULKSTAT_RV_NOTHING; return XFS_ERROR(EINVAL); } @@ -1225,19 +1262,20 @@ int xfs_qm_quotacheck( xfs_mount_t *mp) { - int done, count, error, error2; - xfs_ino_t lastino; - size_t structsz; - xfs_inode_t *uip, *gip; - uint flags; - LIST_HEAD (buffer_list); + int done, count, error, error2; + xfs_ino_t lastino; + size_t structsz; + uint flags; + LIST_HEAD (buffer_list); + struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; + struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; count = INT_MAX; structsz = 1; lastino = 0; flags = 0; - ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); + ASSERT(uip || gip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); @@ -1247,7 +1285,6 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - uip = mp->m_quotainfo->qi_uquotaip; if (uip) { error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); @@ -1256,14 +1293,14 @@ xfs_qm_quotacheck( flags |= XFS_UQUOTA_CHKD; } - gip = mp->m_quotainfo->qi_gquotaip; if (gip) { error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; - flags |= XFS_OQUOTA_CHKD; + flags |= XFS_IS_GQUOTA_ON(mp) ? + XFS_GQUOTA_CHKD : XFS_PQUOTA_CHKD; } do { @@ -1358,15 +1395,13 @@ STATIC int xfs_qm_init_quotainos( xfs_mount_t *mp) { - xfs_inode_t *uip, *gip; - int error; - __int64_t sbflags; - uint flags; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + int error; + __int64_t sbflags = 0; + uint flags = 0; ASSERT(mp->m_quotainfo); - uip = gip = NULL; - sbflags = 0; - flags = 0; /* * Get the uquota and gquota inodes @@ -1375,19 +1410,18 @@ xfs_qm_init_quotainos( if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip))) + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip); + if (error) return XFS_ERROR(error); } if (XFS_IS_OQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip))) { - if (uip) - IRELE(uip); - return XFS_ERROR(error); - } + error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip); + if (error) + goto error_rele; } } else { flags |= XFS_QMOPT_SBVERSION; @@ -1402,10 +1436,11 @@ xfs_qm_init_quotainos( * temporarily switch to read-write to do this. */ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { - if ((error = xfs_qm_qino_alloc(mp, &uip, + error = xfs_qm_qino_alloc(mp, &uip, sbflags | XFS_SB_UQUOTINO, - flags | XFS_QMOPT_UQUOTA))) - return XFS_ERROR(error); + flags | XFS_QMOPT_UQUOTA); + if (error) + goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } @@ -1414,18 +1449,21 @@ xfs_qm_init_quotainos( XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); error = xfs_qm_qino_alloc(mp, &gip, sbflags | XFS_SB_GQUOTINO, flags); - if (error) { - if (uip) - IRELE(uip); - - return XFS_ERROR(error); - } + if (error) + goto error_rele; } mp->m_quotainfo->qi_uquotaip = uip; mp->m_quotainfo->qi_gquotaip = gip; return 0; + +error_rele: + if (uip) + IRELE(uip); + if (gip) + IRELE(gip); + return XFS_ERROR(error); } STATIC void @@ -1436,7 +1474,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; @@ -1622,7 +1660,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_gdqpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_dquot *uq, *gq; + struct xfs_dquot *uq = NULL; + struct xfs_dquot *gq = NULL; int error; uint lockflags; @@ -1647,7 +1686,6 @@ xfs_qm_vop_dqalloc( } } - uq = gq = NULL; if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { if (ip->i_d.di_uid != uid) { /* @@ -1660,11 +1698,12 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, XFS_DQ_USER, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &uq))) { + &uq); + if (error) { ASSERT(error != ENOENT); return error; } @@ -1686,15 +1725,14 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, XFS_DQ_GROUP, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); + &gq); + if (error) { ASSERT(error != ENOENT); - return error; + goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; @@ -1706,15 +1744,14 @@ xfs_qm_vop_dqalloc( } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, XFS_DQ_PROJ, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); + &gq); + if (error) { ASSERT(error != ENOENT); - return (error); + goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; @@ -1737,6 +1774,11 @@ xfs_qm_vop_dqalloc( else if (gq) xfs_qm_dqrele(gq); return 0; + +error_rele: + if (uq) + xfs_qm_dqrele(uq); + return error; } /* @@ -1784,29 +1826,31 @@ xfs_qm_vop_chown( */ int xfs_qm_vop_chown_reserve( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + uint flags) { - xfs_mount_t *mp = ip->i_mount; - uint delblks, blkflags, prjflags = 0; - xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - int error; + struct xfs_mount *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + struct xfs_dquot *udq_unres = NULL; + struct xfs_dquot *gdq_unres = NULL; + struct xfs_dquot *udq_delblks = NULL; + struct xfs_dquot *gdq_delblks = NULL; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); delblks = ip->i_delayed_blks; - delblksudq = delblksgdq = unresudq = unresgdq = NULL; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { - delblksudq = udqp; + udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to * unreserve those from the old dquot, and add them to the @@ -1814,7 +1858,7 @@ xfs_qm_vop_chown_reserve( */ if (delblks) { ASSERT(ip->i_udquot); - unresudq = ip->i_udquot; + udq_unres = ip->i_udquot; } } if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { @@ -1825,18 +1869,19 @@ xfs_qm_vop_chown_reserve( if (prjflags || (XFS_IS_GQUOTA_ON(ip->i_mount) && ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { - delblksgdq = gdqp; + gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); - unresgdq = ip->i_gdquot; + gdq_unres = ip->i_gdquot; } } } - if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags))) - return (error); + error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + udq_delblks, gdq_delblks, ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags); + if (error) + return error; /* * Do the delayed blks reservations/unreservations now. Since, these @@ -1848,14 +1893,15 @@ xfs_qm_vop_chown_reserve( /* * Do the reservations first. Unreservation can't fail. */ - ASSERT(delblksudq || delblksgdq); - ASSERT(unresudq || unresgdq); - if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags))) - return (error); + ASSERT(udq_delblks || gdq_delblks); + ASSERT(udq_unres || gdq_unres); + error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_delblks, gdq_delblks, (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags); + if (error) + return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, + udq_unres, gdq_unres, -((xfs_qcnt_t)delblks), 0, blkflags); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 44b858b79d71..bdb4f8b95207 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -69,28 +69,62 @@ typedef struct xfs_quotainfo { struct shrinker qi_shrinker; } xfs_quotainfo_t; -#define XFS_DQUOT_TREE(qi, type) \ - ((type & XFS_DQ_USER) ? \ - &((qi)->qi_uquota_tree) : \ - &((qi)->qi_gquota_tree)) +static inline struct radix_tree_root * +xfs_dquot_tree( + struct xfs_quotainfo *qi, + int type) +{ + switch (type) { + case XFS_DQ_USER: + return &qi->qi_uquota_tree; + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return &qi->qi_gquota_tree; + default: + ASSERT(0); + } + return NULL; +} +static inline struct xfs_inode * +xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +{ + switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return dqp->q_mount->m_quotainfo->qi_uquotaip; + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return dqp->q_mount->m_quotainfo->qi_gquotaip; + default: + ASSERT(0); + } + return NULL; +} -extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); -extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, - xfs_dquot_t *, xfs_dquot_t *, long, long, uint); -extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); -extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); +extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, + unsigned int nbblks); +extern void xfs_trans_mod_dquot(struct xfs_trans *, + struct xfs_dquot *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, long, long, uint); +extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); +extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); /* * We keep the usr and grp dquots separately so that locking will be easier * to do at commit time. All transactions that we know of at this point * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. */ +enum { + XFS_QM_TRANS_USR = 0, + XFS_QM_TRANS_GRP, + XFS_QM_TRANS_DQTYPES +}; #define XFS_QM_TRANS_MAXDQS 2 -typedef struct xfs_dquot_acct { - xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; - xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; -} xfs_dquot_acct_t; +struct xfs_dquot_acct { + struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; +}; /* * Users are allowed to have a usage exceeding their softlimit for @@ -104,22 +138,23 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern int xfs_qm_quotacheck(xfs_mount_t *); -extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); +extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); +extern int xfs_qm_quotacheck(struct xfs_mount *); +extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); /* dquot stuff */ -extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); -extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); +extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); +extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ -extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); -extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); -extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); -extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); +extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct fs_disk_quota *); +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, + struct fs_disk_quota *); +extern int xfs_qm_scall_getqstat(struct xfs_mount *, + struct fs_quota_stat *); +extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); +extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index cf9a34051e07..a08801ae24e2 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -117,11 +117,11 @@ xfs_qm_scall_quotaoff( } if (flags & XFS_GQUOTA_ACCT) { dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); inactivate_flags |= XFS_GQUOTA_ACTIVE; } else if (flags & XFS_PQUOTA_ACCT) { dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); inactivate_flags |= XFS_PQUOTA_ACTIVE; } @@ -335,14 +335,14 @@ xfs_qm_scall_quotaon( * quota acct on ondisk without m_qflags' knowing. */ if (((flags & XFS_UQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && - (flags & XFS_UQUOTA_ENFD)) - || + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) || + ((flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD)) || ((flags & XFS_PQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && - (flags & XFS_GQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && - (flags & XFS_OQUOTA_ENFD))) { + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_PQUOTA_ENFD))) { xfs_debug(mp, "%s: Can't enforce without acct, flags=%x sbflags=%x\n", __func__, flags, mp->m_sb.sb_qflags); @@ -407,11 +407,11 @@ xfs_qm_scall_getqstat( struct fs_quota_stat *out) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_inode *uip, *gip; - bool tempuqip, tempgqip; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + bool tempuqip = false; + bool tempgqip = false; - uip = gip = NULL; - tempuqip = tempgqip = false; memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; @@ -472,15 +472,15 @@ xfs_qm_scall_getqstat( */ int xfs_qm_scall_setqlim( - xfs_mount_t *mp, + struct xfs_mount *mp, xfs_dqid_t id, uint type, fs_disk_quota_t *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_disk_dquot_t *ddq; - xfs_dquot_t *dqp; - xfs_trans_t *tp; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; int error; xfs_qcnt_t hard, soft; @@ -489,31 +489,36 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -529,6 +534,7 @@ xfs_qm_scall_setqlim( if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { q->qi_bhardlimit = hard; q->qi_bsoftlimit = soft; @@ -620,9 +626,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } @@ -769,9 +776,12 @@ xfs_qm_scall_getquota( * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || - (!XFS_IS_OQUOTA_ENFORCED(mp) && - (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_PROJ)) { dst->d_btimer = 0; dst->d_itimer = 0; dst->d_rtbtimer = 0; @@ -779,8 +789,8 @@ xfs_qm_scall_getquota( #ifdef DEBUG if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || - (XFS_IS_OQUOTA_ENFORCED(mp) && - (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && + (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || + (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && dst->d_id != 0) { if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { @@ -826,16 +836,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) uflags |= FS_QUOTA_UDQ_ACCT; - if (flags & XFS_PQUOTA_ACCT) - uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) uflags |= FS_QUOTA_GDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) uflags |= FS_QUOTA_UDQ_ENFD; - if (flags & (XFS_OQUOTA_ENFD)) { - uflags |= (flags & XFS_GQUOTA_ACCT) ? - FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; - } + if (flags & XFS_GQUOTA_ENFD) + uflags |= FS_QUOTA_GDQ_ENFD; + if (flags & XFS_PQUOTA_ENFD) + uflags |= FS_QUOTA_PDQ_ENFD; return (uflags); } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index b50ec5b95d5a..c3483bab9cde 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -77,9 +77,18 @@ typedef struct xfs_disk_dquot { */ typedef struct xfs_dqblk { xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[32]; /* filling for posterity */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. + */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + /* * flags for q_flags field in the dquot. */ @@ -152,30 +161,42 @@ typedef struct xfs_qoff_logformat { #define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ /* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +/* * Quota Accounting/Enforcement flags */ #define XFS_ALL_QUOTA_ACCT \ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) -#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) -#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) #define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) #define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) #define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) #define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) -#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) /* * Incore only flags for quotaoff - these bits get cleared when quota(s) * are in the process of getting turned off. These flags are in m_qflags but * never in sb_qflags. */ -#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ #define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) /* * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees @@ -259,24 +280,23 @@ typedef struct xfs_qoff_logformat { ((XFS_IS_UQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ (XFS_IS_GQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \ (XFS_IS_PQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) + (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) #define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD) #define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_PQUOTA_ENFD|XFS_PQUOTA_CHKD) #define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ - XFS_GQUOTA_ACCT) + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) /* @@ -380,5 +400,7 @@ extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __KERNEL__ */ #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 71926d630527..20e30f93b0c7 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -75,8 +75,10 @@ xfs_fs_set_xstate( flags |= XFS_GQUOTA_ACCT; if (uflags & FS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) - flags |= XFS_OQUOTA_ENFD; + if (uflags & FS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + if (uflags & FS_QUOTA_PDQ_ENFD) + flags |= XFS_PQUOTA_ENFD; switch (op) { case Q_XQUOTAON: diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index a05b45175fb0..78f9e70b80c7 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -32,6 +32,7 @@ struct xfs_mount; #define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ #define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ #define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 #define XFS_SB_VERSION_SASHFBITS 0xf000 @@ -161,6 +162,20 @@ typedef struct xfs_sb { */ __uint32_t sb_bad_features2; + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -229,7 +244,21 @@ typedef struct xfs_dsb { * for features2 bits. Easiest just to mark it bad and not use * it for anything else. */ - __be32 sb_bad_features2; + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -250,7 +279,10 @@ typedef enum { XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, XFS_SBS_FIELDCOUNT } xfs_sb_field_t; @@ -276,6 +308,12 @@ typedef enum { #define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) #define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) #define XFS_SB_MOD_BITS \ @@ -283,7 +321,9 @@ typedef enum { XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_BAD_FEATURES2) + XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ + XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ + XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) /* @@ -325,6 +365,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp) return 1; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return 1; return 0; } @@ -365,7 +407,7 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) { return sbp->sb_versionnum == XFS_SB_VERSION_2 || sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); } @@ -373,7 +415,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) { if (sbp->sb_versionnum == XFS_SB_VERSION_1) sbp->sb_versionnum = XFS_SB_VERSION_2; - else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; else sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; @@ -382,7 +424,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) { return sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); } @@ -396,13 +438,13 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) { - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; else sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | @@ -411,13 +453,14 @@ static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } @@ -429,38 +472,42 @@ static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)); } static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)); } static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)); } static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT)); } /* @@ -475,14 +522,16 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) @@ -500,14 +549,79 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) { - return (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_ALL 0 +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; +} + +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino); } /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ea341cea68cb..1d68ffcdeaa7 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -51,6 +51,7 @@ #include "xfs_inode_item.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_icreate_item.h" #include <linux/namei.h> #include <linux/init.h> @@ -359,17 +360,17 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_PQUOTA) || !strcmp(this_char, MNTOPT_PRJQUOTA)) { mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_OQUOTA_ENFD); + XFS_PQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_OQUOTA_ENFD; + mp->m_qflags &= ~XFS_PQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_GQUOTA) || !strcmp(this_char, MNTOPT_GRPQUOTA)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_OQUOTA_ENFD); + XFS_GQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_OQUOTA_ENFD; + mp->m_qflags &= ~XFS_GQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { xfs_warn(mp, "delaylog is the default now, option is deprecated."); @@ -439,20 +440,15 @@ xfs_parseargs( } done: - if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { + if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { /* * At this point the superblock has not been read * in, therefore we do not know the block size. * Before the mount call ends we will convert * these to FSBs. */ - if (dsunit) { - mp->m_dalign = dsunit; - mp->m_flags |= XFS_MOUNT_RETERR; - } - - if (dswidth) - mp->m_swidth = dswidth; + mp->m_dalign = dsunit; + mp->m_swidth = dswidth; } if (mp->m_logbufs != -1 && @@ -563,12 +559,12 @@ xfs_showargs( /* Either project or group quotas can be active, not both */ if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_OQUOTA_ENFD) + if (mp->m_qflags & XFS_PQUOTA_ENFD) seq_puts(m, "," MNTOPT_PRJQUOTA); else seq_puts(m, "," MNTOPT_PQUOTANOENF); } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_OQUOTA_ENFD) + if (mp->m_qflags & XFS_GQUOTA_ENFD) seq_puts(m, "," MNTOPT_GRPQUOTA); else seq_puts(m, "," MNTOPT_GQUOTANOENF); @@ -1136,8 +1132,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == - (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); return 0; } @@ -1373,6 +1369,17 @@ xfs_finish_flags( } /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + + /* * mkfs'ed attr2 will turn on attr2 mount unless explicitly * told by noattr2 to turn it off */ @@ -1470,6 +1477,10 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); + /* version 5 superblocks support inode version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + sb->s_flags |= MS_I_VERSION; + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1644,9 +1655,15 @@ xfs_init_zones(void) KM_ZONE_SPREAD, NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), + "xfs_icr"); + if (!xfs_icreate_zone) + goto out_destroy_ili_zone; return 0; + out_destroy_ili_zone: + kmem_zone_destroy(xfs_ili_zone); out_destroy_inode_zone: kmem_zone_destroy(xfs_inode_zone); out_destroy_efi_zone: @@ -1685,6 +1702,7 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_icreate_zone); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); kmem_zone_destroy(xfs_efi_zone); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c new file mode 100644 index 000000000000..e830fb56e27f --- /dev/null +++ b/fs/xfs/xfs_symlink.c @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_itable.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_log_priv.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. + */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + + return (pathlen + buflen - 1) / buflen; +} + +static int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts. the verifier does + * CRC, location and bounds checking, the unpacking function checks the path + * parameters and owner. + */ +bool +xfs_symlink_hdr_ok( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)) || + !xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset, + byte_cnt, bp)) { + error = EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, bp->b_addr, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = XFS_ERROR(EFSCORRUPTED); + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +int +xfs_symlink( + struct xfs_inode *dp, + struct xfs_name *link_name, + const char *target_path, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; + struct xfs_inode *ip = NULL; + int error = 0; + int pathlen; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + uint resblks; + + *ipp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + + udqp = gdqp = NULL; + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + fs_blocks = 0; + else + fs_blocks = xfs_symlink_blocks(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = ENOMEM; + goto error2; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + xfs_trans_t *ntp; + int size; + xfs_trans_t *tp; + + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error0; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done))) + goto error1; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) + goto error1; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Get a new, empty transaction to return to our caller. + */ + ntp = xfs_trans_dup(tp); + /* + * Commit the transaction containing extent freeing and EFDs. + * If we get an error on the commit here or on the reserve below, + * we need to unlock the inode since the new transaction doesn't + * have the inode attached. + */ + error = xfs_trans_commit(tp, 0); + tp = ntp; + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + /* + * Put an itruncate log reservation in the new transaction + * for our caller. + */ + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + + xfs_trans_ijoin(tp, ip, 0); + *tpp = tp; + return 0; + + error1: + xfs_bmap_cancel(&free_list); + error0: + return error; +} + +/* + * xfs_inactive_symlink - free a symlink + */ +int +xfs_inactive_symlink( + struct xfs_inode *ip, + struct xfs_trans **tp) +{ + struct xfs_mount *mp = ip->i_mount; + int pathlen; + + trace_xfs_inactive_symlink(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Zero length symlinks _can_ exist. + */ + pathlen = (int)ip->i_d.di_size; + if (!pathlen) + return 0; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", + __func__, (unsigned long long)ip->i_ino, pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_bytes > 0) + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + return 0; + } + + /* remove the remote symlink */ + return xfs_inactive_symlink_rmt(ip, tp); +} diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h new file mode 100644 index 000000000000..374394880c01 --- /dev/null +++ b/fs/xfs/xfs_symlink.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; +struct xfs_name; + +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); + +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +#ifdef __KERNEL__ + +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp); + +#endif /* __KERNEL__ */ +#endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 2801b5ce6cdb..1743b9f8e23d 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header; #ifdef CONFIG_PROC_FS STATIC int xfs_stats_clear_proc_handler( - ctl_table *ctl, - int write, - void __user *buffer, - size_t *lenp, - loff_t *ppos) + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) { int c, ret, *valp = ctl->data; __uint32_t vn_active; @@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler( STATIC int xfs_panic_mask_proc_handler( - ctl_table *ctl, - int write, - void __user *buffer, - size_t *lenp, - loff_t *ppos) + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) { int ret, *valp = ctl->data; @@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -static ctl_table xfs_table[] = { +static struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, @@ -227,7 +227,7 @@ static ctl_table xfs_table[] = { {} }; -static ctl_table xfs_dir_table[] = { +static struct ctl_table xfs_dir_table[] = { { .procname = "xfs", .mode = 0555, @@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = { {} }; -static ctl_table xfs_root_table[] = { +static struct ctl_table xfs_root_table[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 624bedd81357..b6e3897c1d9f 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -22,7 +22,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_mount.h" +#include "xfs_da_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_alloc.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16a812977eab..47910e638c18 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \ TP_PROTO(struct xfs_buf_log_item *bip), \ TP_ARGS(bip)) DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); @@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); DECLARE_EVENT_CLASS(xfs_lock_class, TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, @@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss); DEFINE_INODE_EVENT(xfs_getattr); DEFINE_INODE_EVENT(xfs_setattr); DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_readdir); @@ -619,6 +624,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class, (char *)__entry->caller_ip) ) +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ @@ -950,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read); DEFINE_RW_EVENT(xfs_file_splice_write); DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), - TP_ARGS(inode, page, off), + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(pgoff_t, pgoff) __field(loff_t, size) __field(unsigned long, offset) + __field(unsigned int, length) __field(int, delalloc) __field(int, unwritten) ), @@ -971,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class, __entry->pgoff = page_offset(page); __entry->size = i_size_read(inode); __entry->offset = off; + __entry->length = len; __entry->delalloc = delalloc; __entry->unwritten = unwritten; ), TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "delalloc %d unwritten %d", + "length %x delalloc %d unwritten %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pgoff, __entry->size, __entry->offset, + __entry->length, __entry->delalloc, __entry->unwritten) ) #define DEFINE_PAGE_EVENT(name) \ DEFINE_EVENT(xfs_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ - TP_ARGS(inode, page, off)) + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2fd7c1ff1d21..35a229981354 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -234,71 +234,93 @@ xfs_calc_remove_reservation( } /* - * For symlink we can modify: + * For create, break it in to the two cases that the transaction + * covers. We start with the modify case - allocation done by modification + * of the state of existing inodes - and the allocation case. + */ + +/* + * For create we can modify: * the parent directory inode: inode size * the new inode: inode size - * the inode btree entry: 1 block + * the inode btree entry: block size + * the superblock for the nlink flag: sector size * the directory btree: (max depth + v2) * dir block size * the directory inode's bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 kB - * Or in the first xact we allocate some inodes giving: + */ +STATIC uint +xfs_calc_create_resv_modify( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); +} + +/* + * For create we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size */ STATIC uint -xfs_calc_symlink_reservation( +xfs_calc_create_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +__xfs_calc_create_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(1, 1024)), - (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + MAX(xfs_calc_create_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); } /* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * Or in the first xact we allocate some inodes giving: + * For icreate we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size */ STATIC uint -xfs_calc_create_reservation( +xfs_calc_icreate_resv_alloc( struct xfs_mount *mp) { + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +xfs_calc_icreate_reservation(xfs_mount_t *mp) +{ return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - (uint)XFS_FSB_TO_B(mp, 1) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + MAX(xfs_calc_icreate_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return xfs_calc_icreate_reservation(mp); + return __xfs_calc_create_reservation(mp); + } /* @@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation( return xfs_calc_create_reservation(mp); } + +/* + * Making a new symplink is the same as creating a new file, but + * with the added blocks for remote symlink data which can be up to 1kB in + * length (MAXPATHLEN). + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp) + + xfs_calc_buf_res(1, MAXPATHLEN); +} + /* * In freeing an inode we can modify: * the inode being freed: inode size diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index cd29f6171021..2b4946393e30 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -48,6 +48,7 @@ typedef struct xfs_trans_header { #define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -107,7 +108,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 #define XFS_TRANS_CHECKPOINT 42 -#define XFS_TRANS_TYPE_MAX 42 +#define XFS_TRANS_ICREATE 43 +#define XFS_TRANS_TYPE_MAX 43 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -210,23 +212,18 @@ struct xfs_log_item_desc { /* * Per-extent log reservation for the allocation btree changes * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) * block size + * 2 trees * (2 blocks/level * max depth - 1) */ -#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) /* * Per-directory log reservation for any directory change. - * dir blocks: (1 btree block per level + data block + free block) * dblock size - * bmap btree: (levels + 2) * max depth * block size + * dir blocks: (1 btree block per level + data block + free block) + * bmap btree: (levels + 2) * max depth * v2 directory blocks can be fragmented below the dirblksize down to the fsb * size, so account for that in the DAENTER macros. */ -#define XFS_DIROP_LOG_RES(mp) \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) #define XFS_DIROP_LOG_COUNT(mp) \ (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) @@ -405,7 +402,7 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int64_t t_ag_freeblks_delta; /* debugging counter */ int64_t t_ag_flist_delta; /* debugging counter */ int64_t t_ag_btree_delta; /* debugging counter */ @@ -433,7 +430,7 @@ typedef struct xfs_trans { #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) #define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) #define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) @@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3edf5dbee001..aa5a04b844d6 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -397,7 +397,6 @@ shutdown_abort: return XFS_ERROR(EIO); } - /* * Release the buffer bp which was previously acquired with one of the * xfs_trans_... buffer allocation routines if the buffer has not @@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; - bip->bli_flags |= XFS_BLI_LOGGED; - xfs_buf_item_log(bip, first, last); + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. + */ + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); } @@ -659,6 +664,7 @@ xfs_trans_binval( ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); @@ -671,6 +677,7 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); @@ -702,12 +709,13 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets - * special processing during unpin - where any inodes + * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be @@ -726,6 +734,7 @@ xfs_trans_stale_inode_buf( bip->bli_flags |= XFS_BLI_STALE_INODE; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* @@ -749,8 +758,66 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * Mark the buffer as ordered for this transaction. This means + * that the contents of the buffer are not recorded in the transaction + * but it is tracked in the AIL as though it was. This allows us + * to record logical changes in transactions rather than the physical + * changes we make to the buffer without changing writeback ordering + * constraints of metadata buffers. + */ +void +xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); +} + +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!tp) + return; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + xfs_blft_to_flags(&bip->__bli_format, type); } +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); +} /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of @@ -769,14 +836,28 @@ xfs_trans_dquot_buf( xfs_buf_t *bp, uint type) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; - ASSERT(bp->b_transp == tp); - ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); - ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = XFS_BLFT_UNKNOWN_BUF; + break; + } + + xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 642c2d6e1db1..3ba64d540168 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo( return; xfs_trans_alloc_dqinfo(ntp); - oqa = otp->t_dqinfo->dqa_usrdquots; - nqa = ntp->t_dqinfo->dqa_usrdquots; /* * Because the quota blk reservation is carried forward, @@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo( if(otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; - for (j = 0; j < 2; j++) { + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + oqa = otp->t_dqinfo->dqs[j]; + nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (oqa[i].qt_dquot == NULL) break; @@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo( oq->qt_ino_res = oq->qt_ino_res_used; } - oqa = otp->t_dqinfo->dqa_grpdquots; - nqa = ntp->t_dqinfo->dqa_grpdquots; } } @@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino( if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; if (tp->t_dqinfo == NULL) @@ -170,16 +167,18 @@ xfs_trans_mod_dquot_byino( (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); } -STATIC xfs_dqtrx_t * +STATIC struct xfs_dqtrx * xfs_trans_get_dqtrx( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { - int i; - xfs_dqtrx_t *qa; + int i; + struct xfs_dqtrx *qa; - qa = XFS_QM_ISUDQ(dqp) ? - tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + if (XFS_QM_ISUDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; + else + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || @@ -326,12 +325,12 @@ xfs_trans_dqlockedjoin( */ void xfs_trans_apply_dquot_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; - xfs_disk_dquot_t *d; + struct xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; long totalbdelta; long totalrtbdelta; @@ -339,12 +338,10 @@ xfs_trans_apply_dquot_deltas( return; ASSERT(tp->t_dqinfo); - qa = tp->t_dqinfo->dqa_usrdquots; - for (j = 0; j < 2; j++) { - if (qa[0].qt_dquot == NULL) { - qa = tp->t_dqinfo->dqa_grpdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + if (qa[0].qt_dquot == NULL) continue; - } /* * Lock all of the dquots and join them to the transaction. @@ -412,7 +409,7 @@ xfs_trans_apply_dquot_deltas( * Start/reset the timer(s) if needed. */ if (d->d_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); xfs_qm_adjust_dqtimers(tp->t_mountp, d); } @@ -495,10 +492,6 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); } - /* - * Do the group quotas next - */ - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -521,9 +514,9 @@ xfs_trans_unreserve_and_mod_dquots( if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; - qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; - for (j = 0; j < 2; j++) { for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { qtrx = &qa[i]; /* @@ -565,7 +558,6 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqunlock(dqp); } - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -640,8 +632,8 @@ xfs_trans_dqresv( if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || - (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && - (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { + (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || + (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { if (nblks > 0) { /* * dquot is locked already. See if we'd go over the @@ -748,15 +740,15 @@ error_return: */ int xfs_trans_reserve_quota_bydquots( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - long nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + long nblks, + long ninos, + uint flags) { - int resvd = 0, error; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; @@ -771,28 +763,24 @@ xfs_trans_reserve_quota_bydquots( (flags & ~XFS_QMOPT_ENOSPC)); if (error) return error; - resvd = 1; } if (gdqp) { error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); - if (error) { - /* - * can't do it, so backout previous reservation - */ - if (resvd) { - flags |= XFS_QMOPT_FORCE_RES; - xfs_trans_dqresv(tp, mp, udqp, - -nblks, -ninos, flags); - } - return error; - } + if (error) + goto unwind_usr; } /* * Didn't change anything critical, so, no need to log */ return 0; + +unwind_usr: + flags |= XFS_QMOPT_FORCE_RES; + if (udqp) + xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); + return error; } @@ -816,8 +804,7 @@ xfs_trans_reserve_quota_nblks( if (XFS_IS_PQUOTA_ON(mp)) flags |= XFS_QMOPT_ENOSPC; - ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); - ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index ac6d567704db..53dfe46f3680 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -112,6 +112,17 @@ xfs_trans_log_inode( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* + * First time we log the inode in a transaction, bump the inode change + * counter if it is configured for this to occur. + */ + if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + IS_I_VERSION(VFS_I(ip))) { + inode_inc_iversion(VFS_I(ip)); + ip->i_d.di_changecount = VFS_I(ip)->i_version; + flags |= XFS_ILOG_CORE; + } + tp->t_flags |= XFS_TRANS_DIRTY; ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 77ad74834baa..42c0ef288aeb 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -48,103 +49,8 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" -/* - * The maximum pathlen is 1024 bytes. Since the minimum file system - * blocksize is 512 bytes, we can get a max of 2 extents back from - * bmapi. - */ -#define SYMLINK_MAPS 2 - -STATIC int -xfs_readlink_bmap( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - int pathlen = ip->i_d.di_size; - int nmaps = SYMLINK_MAPS; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - int byte_cnt; - int n; - xfs_buf_t *bp; - int error = 0; - - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, - 0); - if (error) - goto out; - - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); - if (!bp) - return XFS_ERROR(ENOMEM); - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - goto out; - } - if (pathlen < byte_cnt) - byte_cnt = pathlen; - pathlen -= byte_cnt; - - memcpy(link, bp->b_addr, byte_cnt); - xfs_buf_relse(bp); - } - - link[ip->i_d.di_size] = '\0'; - error = 0; - - out: - return error; -} - -int -xfs_readlink( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; - - trace_xfs_readlink(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - pathlen = ip->i_d.di_size; - if (!pathlen) - goto out; - - if (pathlen < 0 || pathlen > MAXPATHLEN) { - xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", - __func__, (unsigned long long) ip->i_ino, - (long long) pathlen); - ASSERT(0); - error = XFS_ERROR(EFSCORRUPTED); - goto out; - } - - - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } - - out: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} /* * This is called by xfs_inactive to free any blocks beyond eof @@ -249,145 +155,6 @@ xfs_free_eofblocks( return error; } -/* - * Free a symlink that has blocks associated with it. - */ -STATIC int -xfs_inactive_symlink_rmt( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_buf_t *bp; - int committed; - int done; - int error; - xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - int nmaps; - xfs_trans_t *ntp; - int size; - xfs_trans_t *tp; - - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink that has some - * blocks allocated to it. Free the - * blocks here. We know that we've got - * either 1 or 2 extents and that we can - * free them all in one bunmapi call. - */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - - /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it - * held so the cancel won't rele it, see below. - */ - size = (int)ip->i_d.di_size; - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - xfs_bmap_init(&free_list, &first_block); - nmaps = ARRAY_SIZE(mval); - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), - mval, &nmaps, 0); - if (error) - goto error0; - /* - * Invalidate the block(s). - */ - for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = ENOMEM; - goto error1; - } - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the free_list. - */ - if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, &done))) - goto error1; - ASSERT(done); - /* - * Commit the first transaction. This logs the EFI and the inode. - */ - if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) - goto error1; - /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Get a new, empty transaction to return to our caller. - */ - ntp = xfs_trans_dup(tp); - /* - * Commit the transaction containing extent freeing and EFDs. - * If we get an error on the commit here or on the reserve below, - * we need to unlock the inode since the new transaction doesn't - * have the inode attached. - */ - error = xfs_trans_commit(tp, 0); - tp = ntp; - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - - /* - * Remove the memory for extent descriptions (just bookkeeping). - */ - if (ip->i_df.if_bytes) - xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - /* - * Put an itruncate log reservation in the new transaction - * for our caller. - */ - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - - xfs_trans_ijoin(tp, ip, 0); - *tpp = tp; - return 0; - - error1: - xfs_bmap_cancel(&free_list); - error0: - return error; -} - int xfs_release( xfs_inode_t *ip) @@ -555,18 +322,9 @@ xfs_inactive( xfs_trans_ijoin(tp, ip, 0); if (S_ISLNK(ip->i_d.di_mode)) { - /* - * Zero length symlinks _can_ exist. - */ - if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) { - error = xfs_inactive_symlink_rmt(ip, &tp); - if (error) - goto out_cancel; - } else if (ip->i_df.if_bytes > 0) { - xfs_idata_realloc(ip, -(ip->i_df.if_bytes), - XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - } + error = xfs_inactive_symlink(ip, &tp); + if (error) + goto out_cancel; } else if (truncate) { ip->i_d.di_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1353,247 +1111,6 @@ xfs_link( } int -xfs_symlink( - xfs_inode_t *dp, - struct xfs_name *link_name, - const char *target_path, - umode_t mode, - xfs_inode_t **ipp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp; - xfs_inode_t *ip; - int error; - int pathlen; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; - uint cancel_flags; - int committed; - xfs_fileoff_t first_fsb; - xfs_filblks_t fs_blocks; - int nmaps; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - const char *cur_chunk; - int byte_cnt; - int n; - xfs_buf_t *bp; - prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - - *ipp = NULL; - error = 0; - ip = NULL; - tp = NULL; - - trace_xfs_symlink(dp, link_name); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Check component lengths of the target path name. - */ - pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ - return XFS_ERROR(ENAMETOOLONG); - - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. - */ - if (pathlen <= XFS_LITINO(mp)) - fs_blocks = 0; - else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - if (error == ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = true; - - /* - * Check whether the directory allows new symlinks or not. - */ - if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { - error = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - /* - * Check for ability to enter directory entry, if no space reserved. - */ - error = xfs_dir_canenter(tp, dp, link_name, resblks); - if (error) - goto error_return; - /* - * Initialize the bmap freelist prior to calling either - * bmapi or the directory create code. - */ - xfs_bmap_init(&free_list, &first_block); - - /* - * Allocate an inode for the symlink. - */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto error1; - } - - /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the - * error path. - */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; - - /* - * Also attach the dquot(s) to it, if applicable. - */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); - - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. - */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; - - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - - } else { - first_fsb = 0; - nmaps = SYMLINK_MAPS; - - error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_METADATA, &first_block, resblks, - mval, &nmaps, &free_list); - if (error) - goto error2; - - if (resblks) - resblks -= fs_blocks; - ip->i_d.di_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = ENOMEM; - goto error2; - } - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } - pathlen -= byte_cnt; - - memcpy(bp->b_addr, cur_chunk, byte_cnt); - cur_chunk += byte_cnt; - - xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); - } - } - - /* - * Create the directory entry for the symlink. - */ - error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error2; - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * symlink transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - *ipp = ip; - return 0; - - error2: - IRELE(ip); - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: - return error; -} - -int xfs_set_dmattrs( xfs_inode_t *ip, u_int evmask, @@ -1927,7 +1444,7 @@ xfs_free_file_space( xfs_mount_t *mp; int nimap; uint resblks; - uint rounding; + xfs_off_t rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; @@ -1956,7 +1473,7 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); } - rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 5163022d9808..38c67c34d73f 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, - xfs_off_t *offset, filldir_t filldir); +int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); |